sys/ufs/ffs/ffs_softdep.c

   1 /*-
   2  * Copyright 1998, 2000 Marshall Kirk McKusick.
   3  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
   4  * All rights reserved.
   5  *
   6  * The soft updates code is derived from the appendix of a University
   7  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
   8  * "Soft Updates: A Solution to the Metadata Update Problem in File
   9  * Systems", CSE-TR-254-95, August 1995).
  10  *
  11  * Further information about soft updates can be obtained from:
  12  *
  13  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
  14  *      1614 Oxford Street              mckusick@mckusick.com
  15  *      Berkeley, CA 94709-1608         +1-510-843-9542
  16  *      USA
  17  *
  18  * Redistribution and use in source and binary forms, with or without
  19  * modification, are permitted provided that the following conditions
  20  * are met:
  21  *
  22  * 1. Redistributions of source code must retain the above copyright
  23  *    notice, this list of conditions and the following disclaimer.
  24  * 2. Redistributions in binary form must reproduce the above copyright
  25  *    notice, this list of conditions and the following disclaimer in the
  26  *    documentation and/or other materials provided with the distribution.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  31  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
  36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38  *
  39  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
  40  */
  41
  42 #include <sys/cdefs.h>
  43 __FBSDID("$FreeBSD$");
  44
  45 #include "opt_ffs.h"
  46 #include "opt_quota.h"
  47 #include "opt_ddb.h"
  48
  49 /*
  50  * For now we want the safety net that the DEBUG flag provides.
  51  */
  52 #ifndef DEBUG
  53 #define DEBUG
  54 #endif
  55
  56 #include <sys/param.h>
  57 #include <sys/kernel.h>
  58 #include <sys/systm.h>
  59 #include <sys/bio.h>
  60 #include <sys/buf.h>
  61 #include <sys/kdb.h>
  62 #include <sys/kthread.h>
  63 #include <sys/ktr.h>
  64 #include <sys/limits.h>
  65 #include <sys/lock.h>
  66 #include <sys/malloc.h>
  67 #include <sys/mount.h>
  68 #include <sys/mutex.h>
  69 #include <sys/namei.h>
  70 #include <sys/priv.h>
  71 #include <sys/proc.h>
  72 #include <sys/rwlock.h>
  73 #include <sys/stat.h>
  74 #include <sys/sysctl.h>
  75 #include <sys/syslog.h>
  76 #include <sys/vnode.h>
  77 #include <sys/conf.h>
  78
  79 #include <ufs/ufs/dir.h>
  80 #include <ufs/ufs/extattr.h>
  81 #include <ufs/ufs/quota.h>
  82 #include <ufs/ufs/inode.h>
  83 #include <ufs/ufs/ufsmount.h>
  84 #include <ufs/ffs/fs.h>
  85 #include <ufs/ffs/softdep.h>
  86 #include <ufs/ffs/ffs_extern.h>
  87 #include <ufs/ufs/ufs_extern.h>
  88
  89 #include <vm/vm.h>
  90 #include <vm/vm_extern.h>
  91 #include <vm/vm_object.h>
  92
  93 #include <geom/geom.h>
  94
  95 #include <ddb/ddb.h>
  96
  97 #define KTR_SUJ 0       /* Define to KTR_SPARE. */
  98
  99 #ifndef SOFTUPDATES
 100
 101 int
 102 softdep_flushfiles(oldmnt, flags, td)
 103         struct mount *oldmnt;
 104         int flags;
 105         struct thread *td;
 106 {
 107
 108         panic("softdep_flushfiles called");
 109 }
 110
 111 int
 112 softdep_mount(devvp, mp, fs, cred)
 113         struct vnode *devvp;
 114         struct mount *mp;
 115         struct fs *fs;
 116         struct ucred *cred;
 117 {
 118
 119         return (0);
 120 }
 121
 122 void
 123 softdep_initialize()
 124 {
 125
 126         return;
 127 }
 128
 129 void
 130 softdep_uninitialize()
 131 {
 132
 133         return;
 134 }
 135
 136 void
 137 softdep_unmount(mp)
 138         struct mount *mp;
 139 {
 140
 141         panic("softdep_unmount called");
 142 }
 143
 144 void
 145 softdep_setup_sbupdate(ump, fs, bp)
 146         struct ufsmount *ump;
 147         struct fs *fs;
 148         struct buf *bp;
 149 {
 150
 151         panic("softdep_setup_sbupdate called");
 152 }
 153
 154 void
 155 softdep_setup_inomapdep(bp, ip, newinum, mode)
 156         struct buf *bp;
 157         struct inode *ip;
 158         ino_t newinum;
 159         int mode;
 160 {
 161
 162         panic("softdep_setup_inomapdep called");
 163 }
 164
 165 void
 166 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 167         struct buf *bp;
 168         struct mount *mp;
 169         ufs2_daddr_t newblkno;
 170         int frags;
 171         int oldfrags;
 172 {
 173
 174         panic("softdep_setup_blkmapdep called");
 175 }
 176
 177 void
 178 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 179         struct inode *ip;
 180         ufs_lbn_t lbn;
 181         ufs2_daddr_t newblkno;
 182         ufs2_daddr_t oldblkno;
 183         long newsize;
 184         long oldsize;
 185         struct buf *bp;
 186 {
 187
 188         panic("softdep_setup_allocdirect called");
 189 }
 190
 191 void
 192 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 193         struct inode *ip;
 194         ufs_lbn_t lbn;
 195         ufs2_daddr_t newblkno;
 196         ufs2_daddr_t oldblkno;
 197         long newsize;
 198         long oldsize;
 199         struct buf *bp;
 200 {
 201
 202         panic("softdep_setup_allocext called");
 203 }
 204
 205 void
 206 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 207         struct inode *ip;
 208         ufs_lbn_t lbn;
 209         struct buf *bp;
 210         int ptrno;
 211         ufs2_daddr_t newblkno;
 212         ufs2_daddr_t oldblkno;
 213         struct buf *nbp;
 214 {
 215
 216         panic("softdep_setup_allocindir_page called");
 217 }
 218
 219 void
 220 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 221         struct buf *nbp;
 222         struct inode *ip;
 223         struct buf *bp;
 224         int ptrno;
 225         ufs2_daddr_t newblkno;
 226 {
 227
 228         panic("softdep_setup_allocindir_meta called");
 229 }
 230
 231 void
 232 softdep_journal_freeblocks(ip, cred, length, flags)
 233         struct inode *ip;
 234         struct ucred *cred;
 235         off_t length;
 236         int flags;
 237 {
 238
 239         panic("softdep_journal_freeblocks called");
 240 }
 241
 242 void
 243 softdep_journal_fsync(ip)
 244         struct inode *ip;
 245 {
 246
 247         panic("softdep_journal_fsync called");
 248 }
 249
 250 void
 251 softdep_setup_freeblocks(ip, length, flags)
 252         struct inode *ip;
 253         off_t length;
 254         int flags;
 255 {
 256
 257         panic("softdep_setup_freeblocks called");
 258 }
 259
 260 void
 261 softdep_freefile(pvp, ino, mode)
 262                 struct vnode *pvp;
 263                 ino_t ino;
 264                 int mode;
 265 {
 266
 267         panic("softdep_freefile called");
 268 }
 269
 270 int
 271 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 272         struct buf *bp;
 273         struct inode *dp;
 274         off_t diroffset;
 275         ino_t newinum;
 276         struct buf *newdirbp;
 277         int isnewblk;
 278 {
 279
 280         panic("softdep_setup_directory_add called");
 281 }
 282
 283 void
 284 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
 285         struct buf *bp;
 286         struct inode *dp;
 287         caddr_t base;
 288         caddr_t oldloc;
 289         caddr_t newloc;
 290         int entrysize;
 291 {
 292
 293         panic("softdep_change_directoryentry_offset called");
 294 }
 295
 296 void
 297 softdep_setup_remove(bp, dp, ip, isrmdir)
 298         struct buf *bp;
 299         struct inode *dp;
 300         struct inode *ip;
 301         int isrmdir;
 302 {
 303
 304         panic("softdep_setup_remove called");
 305 }
 306
 307 void
 308 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 309         struct buf *bp;
 310         struct inode *dp;
 311         struct inode *ip;
 312         ino_t newinum;
 313         int isrmdir;
 314 {
 315
 316         panic("softdep_setup_directory_change called");
 317 }
 318
 319 void
 320 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
 321         struct mount *mp;
 322         struct buf *bp;
 323         ufs2_daddr_t blkno;
 324         int frags;
 325         struct workhead *wkhd;
 326 {
 327
 328         panic("%s called", __FUNCTION__);
 329 }
 330
 331 void
 332 softdep_setup_inofree(mp, bp, ino, wkhd)
 333         struct mount *mp;
 334         struct buf *bp;
 335         ino_t ino;
 336         struct workhead *wkhd;
 337 {
 338
 339         panic("%s called", __FUNCTION__);
 340 }
 341
 342 void
 343 softdep_setup_unlink(dp, ip)
 344         struct inode *dp;
 345         struct inode *ip;
 346 {
 347
 348         panic("%s called", __FUNCTION__);
 349 }
 350
 351 void
 352 softdep_setup_link(dp, ip)
 353         struct inode *dp;
 354         struct inode *ip;
 355 {
 356
 357         panic("%s called", __FUNCTION__);
 358 }
 359
 360 void
 361 softdep_revert_link(dp, ip)
 362         struct inode *dp;
 363         struct inode *ip;
 364 {
 365
 366         panic("%s called", __FUNCTION__);
 367 }
 368
 369 void
 370 softdep_setup_rmdir(dp, ip)
 371         struct inode *dp;
 372         struct inode *ip;
 373 {
 374
 375         panic("%s called", __FUNCTION__);
 376 }
 377
 378 void
 379 softdep_revert_rmdir(dp, ip)
 380         struct inode *dp;
 381         struct inode *ip;
 382 {
 383
 384         panic("%s called", __FUNCTION__);
 385 }
 386
 387 void
 388 softdep_setup_create(dp, ip)
 389         struct inode *dp;
 390         struct inode *ip;
 391 {
 392
 393         panic("%s called", __FUNCTION__);
 394 }
 395
 396 void
 397 softdep_revert_create(dp, ip)
 398         struct inode *dp;
 399         struct inode *ip;
 400 {
 401
 402         panic("%s called", __FUNCTION__);
 403 }
 404
 405 void
 406 softdep_setup_mkdir(dp, ip)
 407         struct inode *dp;
 408         struct inode *ip;
 409 {
 410
 411         panic("%s called", __FUNCTION__);
 412 }
 413
 414 void
 415 softdep_revert_mkdir(dp, ip)
 416         struct inode *dp;
 417         struct inode *ip;
 418 {
 419
 420         panic("%s called", __FUNCTION__);
 421 }
 422
 423 void
 424 softdep_setup_dotdot_link(dp, ip)
 425         struct inode *dp;
 426         struct inode *ip;
 427 {
 428
 429         panic("%s called", __FUNCTION__);
 430 }
 431
 432 int
 433 softdep_prealloc(vp, waitok)
 434         struct vnode *vp;
 435         int waitok;
 436 {
 437
 438         panic("%s called", __FUNCTION__);
 439 }
 440
 441 int
 442 softdep_journal_lookup(mp, vpp)
 443         struct mount *mp;
 444         struct vnode **vpp;
 445 {
 446
 447         return (ENOENT);
 448 }
 449
 450 void
 451 softdep_change_linkcnt(ip)
 452         struct inode *ip;
 453 {
 454
 455         panic("softdep_change_linkcnt called");
 456 }
 457
 458 void
 459 softdep_load_inodeblock(ip)
 460         struct inode *ip;
 461 {
 462
 463         panic("softdep_load_inodeblock called");
 464 }
 465
 466 void
 467 softdep_update_inodeblock(ip, bp, waitfor)
 468         struct inode *ip;
 469         struct buf *bp;
 470         int waitfor;
 471 {
 472
 473         panic("softdep_update_inodeblock called");
 474 }
 475
 476 int
 477 softdep_fsync(vp)
 478         struct vnode *vp;       /* the "in_core" copy of the inode */
 479 {
 480
 481         return (0);
 482 }
 483
 484 void
 485 softdep_fsync_mountdev(vp)
 486         struct vnode *vp;
 487 {
 488
 489         return;
 490 }
 491
 492 int
 493 softdep_flushworklist(oldmnt, countp, td)
 494         struct mount *oldmnt;
 495         int *countp;
 496         struct thread *td;
 497 {
 498
 499         *countp = 0;
 500         return (0);
 501 }
 502
 503 int
 504 softdep_sync_metadata(struct vnode *vp)
 505 {
 506
 507         panic("softdep_sync_metadata called");
 508 }
 509
 510 int
 511 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
 512 {
 513
 514         panic("softdep_sync_buf called");
 515 }
 516
 517 int
 518 softdep_slowdown(vp)
 519         struct vnode *vp;
 520 {
 521
 522         panic("softdep_slowdown called");
 523 }
 524
 525 int
 526 softdep_request_cleanup(fs, vp, cred, resource)
 527         struct fs *fs;
 528         struct vnode *vp;
 529         struct ucred *cred;
 530         int resource;
 531 {
 532
 533         return (0);
 534 }
 535
 536 int
 537 softdep_check_suspend(struct mount *mp,
 538                       struct vnode *devvp,
 539                       int softdep_depcnt,
 540                       int softdep_accdepcnt,
 541                       int secondary_writes,
 542                       int secondary_accwrites)
 543 {
 544         struct bufobj *bo;
 545         int error;
 546
 547         (void) softdep_depcnt,
 548         (void) softdep_accdepcnt;
 549
 550         bo = &devvp->v_bufobj;
 551         ASSERT_BO_WLOCKED(bo);
 552
 553         MNT_ILOCK(mp);
 554         while (mp->mnt_secondary_writes != 0) {
 555                 BO_UNLOCK(bo);
 556                 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
 557                     (PUSER - 1) | PDROP, "secwr", 0);
 558                 BO_LOCK(bo);
 559                 MNT_ILOCK(mp);
 560         }
 561
 562         /*
 563          * Reasons for needing more work before suspend:
 564          * - Dirty buffers on devvp.
 565          * - Secondary writes occurred after start of vnode sync loop
 566          */
 567         error = 0;
 568         if (bo->bo_numoutput > 0 ||
 569             bo->bo_dirty.bv_cnt > 0 ||
 570             secondary_writes != 0 ||
 571             mp->mnt_secondary_writes != 0 ||
 572             secondary_accwrites != mp->mnt_secondary_accwrites)
 573                 error = EAGAIN;
 574         BO_UNLOCK(bo);
 575         return (error);
 576 }
 577
 578 void
 579 softdep_get_depcounts(struct mount *mp,
 580                       int *softdepactivep,
 581                       int *softdepactiveaccp)
 582 {
 583         (void) mp;
 584         *softdepactivep = 0;
 585         *softdepactiveaccp = 0;
 586 }
 587
 588 void
 589 softdep_buf_append(bp, wkhd)
 590         struct buf *bp;
 591         struct workhead *wkhd;
 592 {
 593
 594         panic("softdep_buf_appendwork called");
 595 }
 596
 597 void
 598 softdep_inode_append(ip, cred, wkhd)
 599         struct inode *ip;
 600         struct ucred *cred;
 601         struct workhead *wkhd;
 602 {
 603
 604         panic("softdep_inode_appendwork called");
 605 }
 606
 607 void
 608 softdep_freework(wkhd)
 609         struct workhead *wkhd;
 610 {
 611
 612         panic("softdep_freework called");
 613 }
 614
 615 #else
 616
 617 FEATURE(softupdates, "FFS soft-updates support");
 618
 619 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
 620     "soft updates stats");
 621 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
 622     "total dependencies allocated");
 623 static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
 624     "high use dependencies allocated");
 625 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
 626     "current dependencies allocated");
 627 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
 628     "current dependencies written");
 629
 630 unsigned long dep_current[D_LAST + 1];
 631 unsigned long dep_highuse[D_LAST + 1];
 632 unsigned long dep_total[D_LAST + 1];
 633 unsigned long dep_write[D_LAST + 1];
 634
 635 #define SOFTDEP_TYPE(type, str, long)                                   \
 636     static MALLOC_DEFINE(M_ ## type, #str, long);                       \
 637     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,       \
 638         &dep_total[D_ ## type], 0, "");                                 \
 639     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,     \
 640         &dep_current[D_ ## type], 0, "");                               \
 641     SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD,     \
 642         &dep_highuse[D_ ## type], 0, "");                               \
 643     SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,       \
 644         &dep_write[D_ ## type], 0, "");
 645
 646 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
 647 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
 648 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
 649     "Block or frag allocated from cyl group map");
 650 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
 651 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
 652 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
 653 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
 654 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
 655 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
 656 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
 657 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
 658 SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
 659 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
 660 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
 661 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
 662 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
 663 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
 664 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
 665 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
 666 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
 667 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
 668 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
 669 SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
 670 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
 671 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
 672 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
 673 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
 674
 675 static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
 676
 677 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
 678 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
 679 static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
 680
 681 #define M_SOFTDEP_FLAGS (M_WAITOK)
 682
 683 /*
 684  * translate from workitem type to memory type
 685  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 686  */
 687 static struct malloc_type *memtype[] = {
 688         M_PAGEDEP,
 689         M_INODEDEP,
 690         M_BMSAFEMAP,
 691         M_NEWBLK,
 692         M_ALLOCDIRECT,
 693         M_INDIRDEP,
 694         M_ALLOCINDIR,
 695         M_FREEFRAG,
 696         M_FREEBLKS,
 697         M_FREEFILE,
 698         M_DIRADD,
 699         M_MKDIR,
 700         M_DIRREM,
 701         M_NEWDIRBLK,
 702         M_FREEWORK,
 703         M_FREEDEP,
 704         M_JADDREF,
 705         M_JREMREF,
 706         M_JMVREF,
 707         M_JNEWBLK,
 708         M_JFREEBLK,
 709         M_JFREEFRAG,
 710         M_JSEG,
 711         M_JSEGDEP,
 712         M_SBDEP,
 713         M_JTRUNC,
 714         M_JFSYNC,
 715         M_SENTINEL
 716 };
 717
 718 #define DtoM(type) (memtype[type])
 719
 720 /*
 721  * Names of malloc types.
 722  */
 723 #define TYPENAME(type)  \
 724         ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
 725 /*
 726  * End system adaptation definitions.
 727  */
 728
 729 #define DOTDOT_OFFSET   offsetof(struct dirtemplate, dotdot_ino)
 730 #define DOT_OFFSET      offsetof(struct dirtemplate, dot_ino)
 731
 732 /*
 733  * Internal function prototypes.
 734  */
 735 static  void check_clear_deps(struct mount *);
 736 static  void softdep_error(char *, int);
 737 static  int softdep_process_worklist(struct mount *, int);
 738 static  int softdep_waitidle(struct mount *, int);
 739 static  void drain_output(struct vnode *);
 740 static  struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
 741 static  int check_inodedep_free(struct inodedep *);
 742 static  void clear_remove(struct mount *);
 743 static  void clear_inodedeps(struct mount *);
 744 static  void unlinked_inodedep(struct mount *, struct inodedep *);
 745 static  void clear_unlinked_inodedep(struct inodedep *);
 746 static  struct inodedep *first_unlinked_inodedep(struct ufsmount *);
 747 static  int flush_pagedep_deps(struct vnode *, struct mount *,
 748             struct diraddhd *);
 749 static  int free_pagedep(struct pagedep *);
 750 static  int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
 751 static  int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
 752 static  int flush_deplist(struct allocdirectlst *, int, int *);
 753 static  int sync_cgs(struct mount *, int);
 754 static  int handle_written_filepage(struct pagedep *, struct buf *, int);
 755 static  int handle_written_sbdep(struct sbdep *, struct buf *);
 756 static  void initiate_write_sbdep(struct sbdep *);
 757 static  void diradd_inode_written(struct diradd *, struct inodedep *);
 758 static  int handle_written_indirdep(struct indirdep *, struct buf *,
 759             struct buf**, int);
 760 static  int handle_written_inodeblock(struct inodedep *, struct buf *, int);
 761 static  int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
 762             uint8_t *);
 763 static  int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
 764 static  void handle_written_jaddref(struct jaddref *);
 765 static  void handle_written_jremref(struct jremref *);
 766 static  void handle_written_jseg(struct jseg *, struct buf *);
 767 static  void handle_written_jnewblk(struct jnewblk *);
 768 static  void handle_written_jblkdep(struct jblkdep *);
 769 static  void handle_written_jfreefrag(struct jfreefrag *);
 770 static  void complete_jseg(struct jseg *);
 771 static  void complete_jsegs(struct jseg *);
 772 static  void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
 773 static  void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
 774 static  void jremref_write(struct jremref *, struct jseg *, uint8_t *);
 775 static  void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
 776 static  void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
 777 static  void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
 778 static  void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
 779 static  void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
 780 static  void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
 781 static  inline void inoref_write(struct inoref *, struct jseg *,
 782             struct jrefrec *);
 783 static  void handle_allocdirect_partdone(struct allocdirect *,
 784             struct workhead *);
 785 static  struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
 786             struct workhead *);
 787 static  void indirdep_complete(struct indirdep *);
 788 static  int indirblk_lookup(struct mount *, ufs2_daddr_t);
 789 static  void indirblk_insert(struct freework *);
 790 static  void indirblk_remove(struct freework *);
 791 static  void handle_allocindir_partdone(struct allocindir *);
 792 static  void initiate_write_filepage(struct pagedep *, struct buf *);
 793 static  void initiate_write_indirdep(struct indirdep*, struct buf *);
 794 static  void handle_written_mkdir(struct mkdir *, int);
 795 static  int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
 796             uint8_t *);
 797 static  void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
 798 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 799 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
 800 static  void handle_workitem_freefile(struct freefile *);
 801 static  int handle_workitem_remove(struct dirrem *, int);
 802 static  struct dirrem *newdirrem(struct buf *, struct inode *,
 803             struct inode *, int, struct dirrem **);
 804 static  struct indirdep *indirdep_lookup(struct mount *, struct inode *,
 805             struct buf *);
 806 static  void cancel_indirdep(struct indirdep *, struct buf *,
 807             struct freeblks *);
 808 static  void free_indirdep(struct indirdep *);
 809 static  void free_diradd(struct diradd *, struct workhead *);
 810 static  void merge_diradd(struct inodedep *, struct diradd *);
 811 static  void complete_diradd(struct diradd *);
 812 static  struct diradd *diradd_lookup(struct pagedep *, int);
 813 static  struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
 814             struct jremref *);
 815 static  struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
 816             struct jremref *);
 817 static  void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
 818             struct jremref *, struct jremref *);
 819 static  void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
 820             struct jremref *);
 821 static  void cancel_allocindir(struct allocindir *, struct buf *bp,
 822             struct freeblks *, int);
 823 static  int setup_trunc_indir(struct freeblks *, struct inode *,
 824             ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
 825 static  void complete_trunc_indir(struct freework *);
 826 static  void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
 827             int);
 828 static  void complete_mkdir(struct mkdir *);
 829 static  void free_newdirblk(struct newdirblk *);
 830 static  void free_jremref(struct jremref *);
 831 static  void free_jaddref(struct jaddref *);
 832 static  void free_jsegdep(struct jsegdep *);
 833 static  void free_jsegs(struct jblocks *);
 834 static  void rele_jseg(struct jseg *);
 835 static  void free_jseg(struct jseg *, struct jblocks *);
 836 static  void free_jnewblk(struct jnewblk *);
 837 static  void free_jblkdep(struct jblkdep *);
 838 static  void free_jfreefrag(struct jfreefrag *);
 839 static  void free_freedep(struct freedep *);
 840 static  void journal_jremref(struct dirrem *, struct jremref *,
 841             struct inodedep *);
 842 static  void cancel_jnewblk(struct jnewblk *, struct workhead *);
 843 static  int cancel_jaddref(struct jaddref *, struct inodedep *,
 844             struct workhead *);
 845 static  void cancel_jfreefrag(struct jfreefrag *);
 846 static  inline void setup_freedirect(struct freeblks *, struct inode *,
 847             int, int);
 848 static  inline void setup_freeext(struct freeblks *, struct inode *, int, int);
 849 static  inline void setup_freeindir(struct freeblks *, struct inode *, int,
 850             ufs_lbn_t, int);
 851 static  inline struct freeblks *newfreeblks(struct mount *, struct inode *);
 852 static  void freeblks_free(struct ufsmount *, struct freeblks *, int);
 853 static  void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
 854 static  ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
 855 static  int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
 856 static  void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
 857             int, int);
 858 static  void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
 859 static  int cancel_pagedep(struct pagedep *, struct freeblks *, int);
 860 static  int deallocate_dependencies(struct buf *, struct freeblks *, int);
 861 static  void newblk_freefrag(struct newblk*);
 862 static  void free_newblk(struct newblk *);
 863 static  void cancel_allocdirect(struct allocdirectlst *,
 864             struct allocdirect *, struct freeblks *);
 865 static  int check_inode_unwritten(struct inodedep *);
 866 static  int free_inodedep(struct inodedep *);
 867 static  void freework_freeblock(struct freework *);
 868 static  void freework_enqueue(struct freework *);
 869 static  int handle_workitem_freeblocks(struct freeblks *, int);
 870 static  int handle_complete_freeblocks(struct freeblks *, int);
 871 static  void handle_workitem_indirblk(struct freework *);
 872 static  void handle_written_freework(struct freework *);
 873 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
 874 static  struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
 875             struct workhead *);
 876 static  struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
 877             struct inodedep *, struct allocindir *, ufs_lbn_t);
 878 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
 879             ufs2_daddr_t, ufs_lbn_t);
 880 static  void handle_workitem_freefrag(struct freefrag *);
 881 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
 882             ufs_lbn_t);
 883 static  void allocdirect_merge(struct allocdirectlst *,
 884             struct allocdirect *, struct allocdirect *);
 885 static  struct freefrag *allocindir_merge(struct allocindir *,
 886             struct allocindir *);
 887 static  int bmsafemap_find(struct bmsafemap_hashhead *, int,
 888             struct bmsafemap **);
 889 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
 890             int cg, struct bmsafemap *);
 891 static  int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
 892             struct newblk **);
 893 static  int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
 894 static  int inodedep_find(struct inodedep_hashhead *, ino_t,
 895             struct inodedep **);
 896 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
 897 static  int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
 898             int, struct pagedep **);
 899 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 900             struct pagedep **);
 901 static  void pause_timer(void *);
 902 static  int request_cleanup(struct mount *, int);
 903 static  void schedule_cleanup(struct mount *);
 904 static void softdep_ast_cleanup_proc(void);
 905 static  int process_worklist_item(struct mount *, int, int);
 906 static  void process_removes(struct vnode *);
 907 static  void process_truncates(struct vnode *);
 908 static  void jwork_move(struct workhead *, struct workhead *);
 909 static  void jwork_insert(struct workhead *, struct jsegdep *);
 910 static  void add_to_worklist(struct worklist *, int);
 911 static  void wake_worklist(struct worklist *);
 912 static  void wait_worklist(struct worklist *, char *);
 913 static  void remove_from_worklist(struct worklist *);
 914 static  void softdep_flush(void *);
 915 static  void softdep_flushjournal(struct mount *);
 916 static  int softdep_speedup(struct ufsmount *);
 917 static  void worklist_speedup(struct mount *);
 918 static  int journal_mount(struct mount *, struct fs *, struct ucred *);
 919 static  void journal_unmount(struct ufsmount *);
 920 static  int journal_space(struct ufsmount *, int);
 921 static  void journal_suspend(struct ufsmount *);
 922 static  int journal_unsuspend(struct ufsmount *ump);
 923 static  void softdep_prelink(struct vnode *, struct vnode *);
 924 static  void add_to_journal(struct worklist *);
 925 static  void remove_from_journal(struct worklist *);
 926 static  bool softdep_excess_items(struct ufsmount *, int);
 927 static  void softdep_process_journal(struct mount *, struct worklist *, int);
 928 static  struct jremref *newjremref(struct dirrem *, struct inode *,
 929             struct inode *ip, off_t, nlink_t);
 930 static  struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
 931             uint16_t);
 932 static  inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
 933             uint16_t);
 934 static  inline struct jsegdep *inoref_jseg(struct inoref *);
 935 static  struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
 936 static  struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
 937             ufs2_daddr_t, int);
 938 static  void adjust_newfreework(struct freeblks *, int);
 939 static  struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
 940 static  void move_newblock_dep(struct jaddref *, struct inodedep *);
 941 static  void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
 942 static  struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
 943             ufs2_daddr_t, long, ufs_lbn_t);
 944 static  struct freework *newfreework(struct ufsmount *, struct freeblks *,
 945             struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
 946 static  int jwait(struct worklist *, int);
 947 static  struct inodedep *inodedep_lookup_ip(struct inode *);
 948 static  int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
 949 static  struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
 950 static  void handle_jwork(struct workhead *);
 951 static  struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
 952             struct mkdir **);
 953 static  struct jblocks *jblocks_create(void);
 954 static  ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
 955 static  void jblocks_free(struct jblocks *, struct mount *, int);
 956 static  void jblocks_destroy(struct jblocks *);
 957 static  void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
 958
/*
 * Exported softdep operations.
 *
 * Each of these hooks operates on a single buffer.
 * NOTE(review): presumably these are installed in a buffer-operations
 * vector elsewhere in this file, which is why they are static yet
 * described as exported -- confirm against the initializer.
 */
static  void softdep_disk_io_initiation(struct buf *);
static  void softdep_disk_write_complete(struct buf *);
static  void softdep_deallocate_dependencies(struct buf *);
static  int softdep_count_dependencies(struct buf *bp, int);
 966
/*
 * Global lock over all of soft updates.
 *
 * Within this file it is taken around updates of the system-wide
 * dep_current[], dep_highuse[] and dep_total[] counters and around
 * walks of the softdepmounts list.
 */
static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);

/*
 * Wrappers for the global lock.  Each takes a pointer to the mutex;
 * callers in this file pass &lk.
 */
#define ACQUIRE_GBLLOCK(lk)     mtx_lock(lk)
#define FREE_GBLLOCK(lk)        mtx_unlock(lk)
#define GBLLOCK_OWNED(lk)       mtx_assert((lk), MA_OWNED)
 976
/*
 * Per-filesystem soft-updates locking.
 *
 * Every macro takes a struct ufsmount pointer and operates on the
 * sd_fslock rwlock reached through its um_softdep member.  These
 * wrappers only ever take the lock in write (exclusive) mode.
 */
/* Address of the lock, for interfaces such as msleep() that need it. */
#define LOCK_PTR(ump)           (&(ump)->um_softdep->sd_fslock)
/* Non-blocking attempt; the result is tested as a boolean by callers. */
#define TRY_ACQUIRE_LOCK(ump)   rw_try_wlock(&(ump)->um_softdep->sd_fslock)
#define ACQUIRE_LOCK(ump)       rw_wlock(&(ump)->um_softdep->sd_fslock)
#define FREE_LOCK(ump)          rw_wunlock(&(ump)->um_softdep->sd_fslock)
/* Assert that the current thread holds the lock exclusively. */
#define LOCK_OWNED(ump)         rw_assert(&(ump)->um_softdep->sd_fslock, \
                                    RA_WLOCKED)

/* Enable / disable recursive acquisition of a buffer's b_lock. */
#define BUF_AREC(bp)            lockallowrecurse(&(bp)->b_lock)
#define BUF_NOREC(bp)           lockdisablerecurse(&(bp)->b_lock)
 989
 990 /*
 991  * Worklist queue management.
 992  * These routines require that the lock be held.
 993  */
 994 #ifndef /* NOT */ DEBUG
 995 #define WORKLIST_INSERT(head, item) do {        \
 996         (item)->wk_state |= ONWORKLIST;         \
 997         LIST_INSERT_HEAD(head, item, wk_list);  \
 998 } while (0)
 999 #define WORKLIST_REMOVE(item) do {              \
1000         (item)->wk_state &= ~ONWORKLIST;        \
1001         LIST_REMOVE(item, wk_list);             \
1002 } while (0)
1003 #define WORKLIST_INSERT_UNLOCKED        WORKLIST_INSERT
1004 #define WORKLIST_REMOVE_UNLOCKED        WORKLIST_REMOVE
1005
1006 #else /* DEBUG */
1007 static  void worklist_insert(struct workhead *, struct worklist *, int);
1008 static  void worklist_remove(struct worklist *, int);
1009
1010 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1011 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1012 #define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1013 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1014
1015 static void
1016 worklist_insert(head, item, locked)
1017         struct workhead *head;
1018         struct worklist *item;
1019         int locked;
1020 {
1021
1022         if (locked)
1023                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1024         if (item->wk_state & ONWORKLIST)
1025                 panic("worklist_insert: %p %s(0x%X) already on list",
1026                     item, TYPENAME(item->wk_type), item->wk_state);
1027         item->wk_state |= ONWORKLIST;
1028         LIST_INSERT_HEAD(head, item, wk_list);
1029 }
1030
1031 static void
1032 worklist_remove(item, locked)
1033         struct worklist *item;
1034         int locked;
1035 {
1036
1037         if (locked)
1038                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1039         if ((item->wk_state & ONWORKLIST) == 0)
1040                 panic("worklist_remove: %p %s(0x%X) not on list",
1041                     item, TYPENAME(item->wk_type), item->wk_state);
1042         item->wk_state &= ~ONWORKLIST;
1043         LIST_REMOVE(item, wk_list);
1044 }
1045 #endif /* DEBUG */
1046
1047 /*
1048  * Merge two jsegdeps keeping only the oldest one as newer references
1049  * can't be discarded until after older references.
1050  */
1051 static inline struct jsegdep *
1052 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1053 {
1054         struct jsegdep *swp;
1055
1056         if (two == NULL)
1057                 return (one);
1058
1059         if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1060                 swp = one;
1061                 one = two;
1062                 two = swp;
1063         }
1064         WORKLIST_REMOVE(&two->jd_list);
1065         free_jsegdep(two);
1066
1067         return (one);
1068 }
1069
1070 /*
1071  * If two freedeps are compatible free one to reduce list size.
1072  */
1073 static inline struct freedep *
1074 freedep_merge(struct freedep *one, struct freedep *two)
1075 {
1076         if (two == NULL)
1077                 return (one);
1078
1079         if (one->fd_freework == two->fd_freework) {
1080                 WORKLIST_REMOVE(&two->fd_list);
1081                 free_freedep(two);
1082         }
1083         return (one);
1084 }
1085
1086 /*
1087  * Move journal work from one list to another.  Duplicate freedeps and
1088  * jsegdeps are coalesced to keep the lists as small as possible.
1089  */
1090 static void
1091 jwork_move(dst, src)
1092         struct workhead *dst;
1093         struct workhead *src;
1094 {
1095         struct freedep *freedep;
1096         struct jsegdep *jsegdep;
1097         struct worklist *wkn;
1098         struct worklist *wk;
1099
1100         KASSERT(dst != src,
1101             ("jwork_move: dst == src"));
1102         freedep = NULL;
1103         jsegdep = NULL;
1104         LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1105                 if (wk->wk_type == D_JSEGDEP)
1106                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1107                 if (wk->wk_type == D_FREEDEP)
1108                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1109         }
1110
1111         while ((wk = LIST_FIRST(src)) != NULL) {
1112                 WORKLIST_REMOVE(wk);
1113                 WORKLIST_INSERT(dst, wk);
1114                 if (wk->wk_type == D_JSEGDEP) {
1115                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1116                         continue;
1117                 }
1118                 if (wk->wk_type == D_FREEDEP)
1119                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1120         }
1121 }
1122
1123 static void
1124 jwork_insert(dst, jsegdep)
1125         struct workhead *dst;
1126         struct jsegdep *jsegdep;
1127 {
1128         struct jsegdep *jsegdepn;
1129         struct worklist *wk;
1130
1131         LIST_FOREACH(wk, dst, wk_list)
1132                 if (wk->wk_type == D_JSEGDEP)
1133                         break;
1134         if (wk == NULL) {
1135                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1136                 return;
1137         }
1138         jsegdepn = WK_JSEGDEP(wk);
1139         if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1140                 WORKLIST_REMOVE(wk);
1141                 free_jsegdep(jsegdepn);
1142                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1143         } else
1144                 free_jsegdep(jsegdep);
1145 }
1146
/*
 * Routines for tracking and managing workitems.
 *
 * workitem_alloc() initializes the header of a new item and counts it,
 * workitem_free() uncounts and frees it, and workitem_reassign()
 * changes the type of a live item while keeping the counts consistent.
 */
static  void workitem_free(struct worklist *, int);
static  void workitem_alloc(struct worklist *, int, struct mount *);
static  void workitem_reassign(struct worklist *, int);

/*
 * Convenience wrappers that accept a pointer to any dependency
 * structure and cast it to its struct worklist header.
 */
#define WORKITEM_FREE(item, type) \
        workitem_free((struct worklist *)(item), (type))
#define WORKITEM_REASSIGN(item, type) \
        workitem_reassign((struct worklist *)(item), (type))
1158
1159 static void
1160 workitem_free(item, type)
1161         struct worklist *item;
1162         int type;
1163 {
1164         struct ufsmount *ump;
1165
1166 #ifdef DEBUG
1167         if (item->wk_state & ONWORKLIST)
1168                 panic("workitem_free: %s(0x%X) still on list",
1169                     TYPENAME(item->wk_type), item->wk_state);
1170         if (item->wk_type != type && type != D_NEWBLK)
1171                 panic("workitem_free: type mismatch %s != %s",
1172                     TYPENAME(item->wk_type), TYPENAME(type));
1173 #endif
1174         if (item->wk_state & IOWAITING)
1175                 wakeup(item);
1176         ump = VFSTOUFS(item->wk_mp);
1177         LOCK_OWNED(ump);
1178         KASSERT(ump->softdep_deps > 0,
1179             ("workitem_free: %s: softdep_deps going negative",
1180             ump->um_fs->fs_fsmnt));
1181         if (--ump->softdep_deps == 0 && ump->softdep_req)
1182                 wakeup(&ump->softdep_deps);
1183         KASSERT(dep_current[item->wk_type] > 0,
1184             ("workitem_free: %s: dep_current[%s] going negative",
1185             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1186         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1187             ("workitem_free: %s: softdep_curdeps[%s] going negative",
1188             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1189         atomic_subtract_long(&dep_current[item->wk_type], 1);
1190         ump->softdep_curdeps[item->wk_type] -= 1;
1191         free(item, DtoM(type));
1192 }
1193
1194 static void
1195 workitem_alloc(item, type, mp)
1196         struct worklist *item;
1197         int type;
1198         struct mount *mp;
1199 {
1200         struct ufsmount *ump;
1201
1202         item->wk_type = type;
1203         item->wk_mp = mp;
1204         item->wk_state = 0;
1205
1206         ump = VFSTOUFS(mp);
1207         ACQUIRE_GBLLOCK(&lk);
1208         dep_current[type]++;
1209         if (dep_current[type] > dep_highuse[type])
1210                 dep_highuse[type] = dep_current[type];
1211         dep_total[type]++;
1212         FREE_GBLLOCK(&lk);
1213         ACQUIRE_LOCK(ump);
1214         ump->softdep_curdeps[type] += 1;
1215         ump->softdep_deps++;
1216         ump->softdep_accdeps++;
1217         FREE_LOCK(ump);
1218 }
1219
1220 static void
1221 workitem_reassign(item, newtype)
1222         struct worklist *item;
1223         int newtype;
1224 {
1225         struct ufsmount *ump;
1226
1227         ump = VFSTOUFS(item->wk_mp);
1228         LOCK_OWNED(ump);
1229         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1230             ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1231             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1232         ump->softdep_curdeps[item->wk_type] -= 1;
1233         ump->softdep_curdeps[newtype] += 1;
1234         KASSERT(dep_current[item->wk_type] > 0,
1235             ("workitem_reassign: %s: dep_current[%s] going negative",
1236             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1237         ACQUIRE_GBLLOCK(&lk);
1238         dep_current[newtype]++;
1239         dep_current[item->wk_type]--;
1240         if (dep_current[newtype] > dep_highuse[newtype])
1241                 dep_highuse[newtype] = dep_current[newtype];
1242         dep_total[newtype]++;
1243         FREE_GBLLOCK(&lk);
1244         item->wk_type = newtype;
1245 }
1246
/*
 * Workitem queue management
 *
 * max_softdeps, tickdelay and softdep_flushcache are exported read-write
 * through the _debug_softdep sysctl node declared further down.
 */
static int max_softdeps;        /* maximum number of structs before slowdown */
static int tickdelay = 2;       /* number of ticks to pause during slowdown */
static int proc_waiting;        /* tracks whether we have a timeout posted */
static int *stat_countp;        /* statistic to count in proc_waiting timeout */
static struct callout softdep_callout;
/*
 * When either request flag is non-zero, softdep_speedup() also looks
 * for another filesystem whose flusher thread should be woken.
 */
static int req_clear_inodedeps; /* syncer process flush some inodedeps */
static int req_clear_remove;    /* syncer process flush some freeblks */
static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1258
/*
 * runtime statistics
 *
 * Each counter is exported through a SYSCTL_INT under the
 * _debug_softdep node.  stat_flush_threads is also used as a divisor by
 * softdep_speedup() when computing a per-filesystem share of
 * max_softdeps.
 */
static int stat_flush_threads;  /* number of softdep flushing threads */
static int stat_worklist_push;  /* number of worklist cleanups */
static int stat_blk_limit_push; /* number of times block limit neared */
static int stat_ino_limit_push; /* number of times inode limit neared */
static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;      /* bufs redirtied as dir entry cannot write */
/* Exported under the name jaddref_rollback. */
static int stat_jaddref;        /* bufs redirtied as ino bitmap can not write */
/* Exported under the name jnewblk_rollback. */
static int stat_jnewblk;        /* bufs redirtied as blk bitmap can not write */
static int stat_journal_min;    /* Times hit journal min threshold */
static int stat_journal_low;    /* Times hit journal low threshold */
static int stat_journal_wait;   /* Times blocked in jwait(). */
static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
static int stat_jwait_inode;    /* Times blocked in jwait() for inodes. */
static int stat_jwait_newblk;   /* Times blocked in jwait() for newblks. */
static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
static int stat_cleanup_failures; /* Number of cleanup requests that failed */
static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1288
1289 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1290     &max_softdeps, 0, "");
1291 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1292     &tickdelay, 0, "");
1293 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1294     &stat_flush_threads, 0, "");
1295 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1296     &stat_worklist_push, 0,"");
1297 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1298     &stat_blk_limit_push, 0,"");
1299 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1300     &stat_ino_limit_push, 0,"");
1301 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1302     &stat_blk_limit_hit, 0, "");
1303 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1304     &stat_ino_limit_hit, 0, "");
1305 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1306     &stat_sync_limit_hit, 0, "");
1307 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1308     &stat_indir_blk_ptrs, 0, "");
1309 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1310     &stat_inode_bitmap, 0, "");
1311 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1312     &stat_direct_blk_ptrs, 0, "");
1313 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1314     &stat_dir_entry, 0, "");
1315 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1316     &stat_jaddref, 0, "");
1317 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1318     &stat_jnewblk, 0, "");
1319 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1320     &stat_journal_low, 0, "");
1321 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1322     &stat_journal_min, 0, "");
1323 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1324     &stat_journal_wait, 0, "");
1325 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1326     &stat_jwait_filepage, 0, "");
1327 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1328     &stat_jwait_freeblks, 0, "");
1329 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1330     &stat_jwait_inode, 0, "");
1331 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1332     &stat_jwait_newblk, 0, "");
1333 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1334     &stat_cleanup_blkrequests, 0, "");
1335 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1336     &stat_cleanup_inorequests, 0, "");
1337 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1338     &stat_cleanup_high_delay, 0, "");
1339 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1340     &stat_cleanup_retries, 0, "");
1341 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1342     &stat_cleanup_failures, 0, "");
1343 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1344     &softdep_flushcache, 0, "");
1345 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1346     &stat_emptyjblocks, 0, "");
1347
/* The _vfs_ffs sysctl node is defined elsewhere; declare it for use here. */
SYSCTL_DECL(_vfs_ffs);

/* Whether to recompute the summary at mount time */
static int compute_summary_at_mount = 0;
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
           &compute_summary_at_mount, 0, "Recompute summary at mount");
/*
 * When non-zero, softdep_flush() prints a message as each flusher
 * thread starts and stops.
 */
static int print_threads = 0;
SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
    &print_threads, 0, "Notify flusher thread start/stop");

/*
 * List of all filesystems mounted with soft updates.
 * softdep_speedup() walks and reorders it while holding the global
 * lock.
 */
static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1360
1361 /*
1362  * This function cleans the worklist for a filesystem.
1363  * Each filesystem running with soft dependencies gets its own
1364  * thread to run in this function. The thread is started up in
1365  * softdep_mount and shutdown in softdep_unmount. They show up
1366  * as part of the kernel "bufdaemon" process whose process
1367  * entry is available in bufdaemonproc.
1368  */
1369 static int searchfailed;
1370 extern struct proc *bufdaemonproc;
1371 static void
1372 softdep_flush(addr)
1373         void *addr;
1374 {
1375         struct mount *mp;
1376         struct thread *td;
1377         struct ufsmount *ump;
1378
1379         td = curthread;
1380         td->td_pflags |= TDP_NORUNNINGBUF;
1381         mp = (struct mount *)addr;
1382         ump = VFSTOUFS(mp);
1383         atomic_add_int(&stat_flush_threads, 1);
1384         ACQUIRE_LOCK(ump);
1385         ump->softdep_flags &= ~FLUSH_STARTING;
1386         wakeup(&ump->softdep_flushtd);
1387         FREE_LOCK(ump);
1388         if (print_threads) {
1389                 if (stat_flush_threads == 1)
1390                         printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1391                             bufdaemonproc->p_pid);
1392                 printf("Start thread %s\n", td->td_name);
1393         }
1394         for (;;) {
1395                 while (softdep_process_worklist(mp, 0) > 0 ||
1396                     (MOUNTEDSUJ(mp) &&
1397                     VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1398                         kthread_suspend_check();
1399                 ACQUIRE_LOCK(ump);
1400                 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1401                         msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1402                             "sdflush", hz / 2);
1403                 ump->softdep_flags &= ~FLUSH_CLEANUP;
1404                 /*
1405                  * Check to see if we are done and need to exit.
1406                  */
1407                 if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1408                         FREE_LOCK(ump);
1409                         continue;
1410                 }
1411                 ump->softdep_flags &= ~FLUSH_EXIT;
1412                 FREE_LOCK(ump);
1413                 wakeup(&ump->softdep_flags);
1414                 if (print_threads)
1415                         printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1416                 atomic_subtract_int(&stat_flush_threads, 1);
1417                 kthread_exit();
1418                 panic("kthread_exit failed\n");
1419         }
1420 }
1421
1422 static void
1423 worklist_speedup(mp)
1424         struct mount *mp;
1425 {
1426         struct ufsmount *ump;
1427
1428         ump = VFSTOUFS(mp);
1429         LOCK_OWNED(ump);
1430         if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1431                 ump->softdep_flags |= FLUSH_CLEANUP;
1432         wakeup(&ump->softdep_flushtd);
1433 }
1434
1435 static int
1436 softdep_speedup(ump)
1437         struct ufsmount *ump;
1438 {
1439         struct ufsmount *altump;
1440         struct mount_softdeps *sdp;
1441
1442         LOCK_OWNED(ump);
1443         worklist_speedup(ump->um_mountp);
1444         bd_speedup();
1445         /*
1446          * If we have global shortages, then we need other
1447          * filesystems to help with the cleanup. Here we wakeup a
1448          * flusher thread for a filesystem that is over its fair
1449          * share of resources.
1450          */
1451         if (req_clear_inodedeps || req_clear_remove) {
1452                 ACQUIRE_GBLLOCK(&lk);
1453                 TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1454                         if ((altump = sdp->sd_ump) == ump)
1455                                 continue;
1456                         if (((req_clear_inodedeps &&
1457                             altump->softdep_curdeps[D_INODEDEP] >
1458                             max_softdeps / stat_flush_threads) ||
1459                             (req_clear_remove &&
1460                             altump->softdep_curdeps[D_DIRREM] >
1461                             (max_softdeps / 2) / stat_flush_threads)) &&
1462                             TRY_ACQUIRE_LOCK(altump))
1463                                 break;
1464                 }
1465                 if (sdp == NULL) {
1466                         searchfailed++;
1467                         FREE_GBLLOCK(&lk);
1468                 } else {
1469                         /*
1470                          * Move to the end of the list so we pick a
1471                          * different one on out next try.
1472                          */
1473                         TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1474                         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1475                         FREE_GBLLOCK(&lk);
1476                         if ((altump->softdep_flags &
1477                             (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1478                                 altump->softdep_flags |= FLUSH_CLEANUP;
1479                         altump->um_softdep->sd_cleanups++;
1480                         wakeup(&altump->softdep_flushtd);
1481                         FREE_LOCK(altump);
1482                 }
1483         }
1484         return (speedup_syncer());
1485 }
1486
1487 /*
1488  * Add an item to the end of the work queue.
1489  * This routine requires that the lock be held.
1490  * This is the only routine that adds items to the list.
1491  * The following routine is the only one that removes items
1492  * and does so in order from first to last.
1493  */
1494
1495 #define WK_HEAD         0x0001  /* Add to HEAD. */
1496 #define WK_NODELAY      0x0002  /* Process immediately. */
1497
1498 static void
1499 add_to_worklist(wk, flags)
1500         struct worklist *wk;
1501         int flags;
1502 {
1503         struct ufsmount *ump;
1504
1505         ump = VFSTOUFS(wk->wk_mp);
1506         LOCK_OWNED(ump);
1507         if (wk->wk_state & ONWORKLIST)
1508                 panic("add_to_worklist: %s(0x%X) already on list",
1509                     TYPENAME(wk->wk_type), wk->wk_state);
1510         wk->wk_state |= ONWORKLIST;
1511         if (ump->softdep_on_worklist == 0) {
1512                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1513                 ump->softdep_worklist_tail = wk;
1514         } else if (flags & WK_HEAD) {
1515                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1516         } else {
1517                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1518                 ump->softdep_worklist_tail = wk;
1519         }
1520         ump->softdep_on_worklist += 1;
1521         if (flags & WK_NODELAY)
1522                 worklist_speedup(wk->wk_mp);
1523 }
1524
1525 /*
1526  * Remove the item to be processed. If we are removing the last
1527  * item on the list, we need to recalculate the tail pointer.
1528  */
1529 static void
1530 remove_from_worklist(wk)
1531         struct worklist *wk;
1532 {
1533         struct ufsmount *ump;
1534
1535         ump = VFSTOUFS(wk->wk_mp);
1536         WORKLIST_REMOVE(wk);
1537         if (ump->softdep_worklist_tail == wk)
1538                 ump->softdep_worklist_tail =
1539                     (struct worklist *)wk->wk_list.le_prev;
1540         ump->softdep_on_worklist -= 1;
1541 }
1542
1543 static void
1544 wake_worklist(wk)
1545         struct worklist *wk;
1546 {
1547         if (wk->wk_state & IOWAITING) {
1548                 wk->wk_state &= ~IOWAITING;
1549                 wakeup(wk);
1550         }
1551 }
1552
1553 static void
1554 wait_worklist(wk, wmesg)
1555         struct worklist *wk;
1556         char *wmesg;
1557 {
1558         struct ufsmount *ump;
1559
1560         ump = VFSTOUFS(wk->wk_mp);
1561         wk->wk_state |= IOWAITING;
1562         msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1563 }
1564
1565 /*
1566  * Process that runs once per second to handle items in the background queue.
1567  *
1568  * Note that we ensure that everything is done in the order in which they
1569  * appear in the queue. The code below depends on this property to ensure
1570  * that blocks of a file are freed before the inode itself is freed. This
1571  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1572  * until all the old ones have been purged from the dependency lists.
1573  */
1574 static int
1575 softdep_process_worklist(mp, full)
1576         struct mount *mp;
1577         int full;
1578 {
1579         int cnt, matchcnt;
1580         struct ufsmount *ump;
1581         long starttime;
1582
1583         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1584         if (MOUNTEDSOFTDEP(mp) == 0)
1585                 return (0);
1586         matchcnt = 0;
1587         ump = VFSTOUFS(mp);
1588         ACQUIRE_LOCK(ump);
1589         starttime = time_second;
1590         softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1591         check_clear_deps(mp);
1592         while (ump->softdep_on_worklist > 0) {
1593                 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1594                         break;
1595                 else
1596                         matchcnt += cnt;
1597                 check_clear_deps(mp);
1598                 /*
1599                  * We do not generally want to stop for buffer space, but if
1600                  * we are really being a buffer hog, we will stop and wait.
1601                  */
1602                 if (should_yield()) {
1603                         FREE_LOCK(ump);
1604                         kern_yield(PRI_USER);
1605                         bwillwrite();
1606                         ACQUIRE_LOCK(ump);
1607                 }
1608                 /*
1609                  * Never allow processing to run for more than one
1610                  * second. This gives the syncer thread the opportunity
1611                  * to pause if appropriate.
1612                  */
1613                 if (!full && starttime != time_second)
1614                         break;
1615         }
1616         if (full == 0)
1617                 journal_unsuspend(ump);
1618         FREE_LOCK(ump);
1619         return (matchcnt);
1620 }
1621
1622 /*
1623  * Process all removes associated with a vnode if we are running out of
1624  * journal space.  Any other process which attempts to flush these will
1625  * be unable as we have the vnodes locked.
1626  */
1627 static void
1628 process_removes(vp)
1629         struct vnode *vp;
1630 {
1631         struct inodedep *inodedep;
1632         struct dirrem *dirrem;
1633         struct ufsmount *ump;
1634         struct mount *mp;
1635         ino_t inum;
1636
1637         mp = vp->v_mount;
1638         ump = VFSTOUFS(mp);
1639         LOCK_OWNED(ump);
1640         inum = VTOI(vp)->i_number;
1641         for (;;) {
1642 top:
1643                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1644                         return;
1645                 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1646                         /*
1647                          * If another thread is trying to lock this vnode
1648                          * it will fail but we must wait for it to do so
1649                          * before we can proceed.
1650                          */
1651                         if (dirrem->dm_state & INPROGRESS) {
1652                                 wait_worklist(&dirrem->dm_list, "pwrwait");
1653                                 goto top;
1654                         }
1655                         if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1656                             (COMPLETE | ONWORKLIST))
1657                                 break;
1658                 }
1659                 if (dirrem == NULL)
1660                         return;
1661                 remove_from_worklist(&dirrem->dm_list);
1662                 FREE_LOCK(ump);
1663                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1664                         panic("process_removes: suspended filesystem");
1665                 handle_workitem_remove(dirrem, 0);
1666                 vn_finished_secondary_write(mp);
1667                 ACQUIRE_LOCK(ump);
1668         }
1669 }
1670
/*
 * Process all truncations associated with a vnode if we are running out
 * of journal space.  This is called when the vnode lock is already held
 * and no other process can clear the truncation.  The function is
 * declared void, so callers get no indication of whether it did any
 * work.
 */
1677 static void
1678 process_truncates(vp)
1679         struct vnode *vp;
1680 {
1681         struct inodedep *inodedep;
1682         struct freeblks *freeblks;
1683         struct ufsmount *ump;
1684         struct mount *mp;
1685         ino_t inum;
1686         int cgwait;
1687
1688         mp = vp->v_mount;
1689         ump = VFSTOUFS(mp);
1690         LOCK_OWNED(ump);
1691         inum = VTOI(vp)->i_number;
1692         for (;;) {
1693                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1694                         return;
1695                 cgwait = 0;
1696                 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1697                         /* Journal entries not yet written.  */
1698                         if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1699                                 jwait(&LIST_FIRST(
1700                                     &freeblks->fb_jblkdephd)->jb_list,
1701                                     MNT_WAIT);
1702                                 break;
1703                         }
1704                         /* Another thread is executing this item. */
1705                         if (freeblks->fb_state & INPROGRESS) {
1706                                 wait_worklist(&freeblks->fb_list, "ptrwait");
1707                                 break;
1708                         }
1709                         /* Freeblks is waiting on a inode write. */
1710                         if ((freeblks->fb_state & COMPLETE) == 0) {
1711                                 FREE_LOCK(ump);
1712                                 ffs_update(vp, 1);
1713                                 ACQUIRE_LOCK(ump);
1714                                 break;
1715                         }
1716                         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1717                             (ALLCOMPLETE | ONWORKLIST)) {
1718                                 remove_from_worklist(&freeblks->fb_list);
1719                                 freeblks->fb_state |= INPROGRESS;
1720                                 FREE_LOCK(ump);
1721                                 if (vn_start_secondary_write(NULL, &mp,
1722                                     V_NOWAIT))
1723                                         panic("process_truncates: "
1724                                             "suspended filesystem");
1725                                 handle_workitem_freeblocks(freeblks, 0);
1726                                 vn_finished_secondary_write(mp);
1727                                 ACQUIRE_LOCK(ump);
1728                                 break;
1729                         }
1730                         if (freeblks->fb_cgwait)
1731                                 cgwait++;
1732                 }
1733                 if (cgwait) {
1734                         FREE_LOCK(ump);
1735                         sync_cgs(mp, MNT_WAIT);
1736                         ffs_sync_snap(mp, MNT_WAIT);
1737                         ACQUIRE_LOCK(ump);
1738                         continue;
1739                 }
1740                 if (freeblks == NULL)
1741                         break;
1742         }
1743         return;
1744 }
1745
1746 /*
1747  * Process items on the worklist, stopping once "target" of them have
      * been handled with error == 0 or nothing is left after our sentinel.
      *
      * Returns the number of items handled with error == 0, or -1 without
      * touching the list when the current thread has TDP_COWINPROGRESS set.
      * LOCK_OWNED(ump) is asserted; the lock is dropped while each handler
      * runs and reacquired afterwards.
1748  */
1749 static int
1750 process_worklist_item(mp, target, flags)
1751         struct mount *mp;
1752         int target;
1753         int flags;
1754 {
1755         struct worklist sentinel;
1756         struct worklist *wk;
1757         struct ufsmount *ump;
1758         int matchcnt;
1759         int error;
1760
1761         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1762         /*
1763          * If we are being called because of a process doing a
1764          * copy-on-write, then it is not safe to write as we may
1765          * recurse into the copy-on-write routine.
1766          */
1767         if (curthread->td_pflags & TDP_COWINPROGRESS)
1768                 return (-1);
1769         PHOLD(curproc); /* Don't let the stack go away. */
1770         ump = VFSTOUFS(mp);
1771         LOCK_OWNED(ump);
1772         matchcnt = 0;
             /*
              * A stack-allocated sentinel marks our position in the pending
              * list; the item examined next is always the one right after it.
              */
1773         sentinel.wk_mp = NULL;
1774         sentinel.wk_type = D_SENTINEL;
1775         LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1776         for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1777             wk = LIST_NEXT(&sentinel, wk_list)) {
                     /*
                      * The next entry is itself a sentinel (presumably one
                      * belonging to a concurrent caller): step over it.
                      */
1778                 if (wk->wk_type == D_SENTINEL) {
1779                         LIST_REMOVE(&sentinel, wk_list);
1780                         LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1781                         continue;
1782                 }
1783                 if (wk->wk_state & INPROGRESS)
1784                         panic("process_worklist_item: %p already in progress.",
1785                             wk);
                     /* Claim the item before dropping the lock to run it. */
1786                 wk->wk_state |= INPROGRESS;
1787                 remove_from_worklist(wk);
1788                 FREE_LOCK(ump);
1789                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1790                         panic("process_worklist_item: suspended filesystem");
1791                 switch (wk->wk_type) {
1792                 case D_DIRREM:
1793                         /* removal of a directory entry */
1794                         error = handle_workitem_remove(WK_DIRREM(wk), flags);
1795                         break;
1796
1797                 case D_FREEBLKS:
1798                         /* releasing blocks and/or fragments from a file */
1799                         error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1800                             flags);
1801                         break;
1802
1803                 case D_FREEFRAG:
1804                         /* releasing a fragment when replaced as a file grows */
1805                         handle_workitem_freefrag(WK_FREEFRAG(wk));
1806                         error = 0;
1807                         break;
1808
1809                 case D_FREEFILE:
1810                         /* releasing an inode when its link count drops to 0 */
1811                         handle_workitem_freefile(WK_FREEFILE(wk));
1812                         error = 0;
1813                         break;
1814
1815                 default:
1816                         panic("%s_process_worklist: Unknown type %s",
1817                             "softdep", TYPENAME(wk->wk_type));
1818                         /* NOTREACHED */
1819                 }
1820                 vn_finished_secondary_write(mp);
1821                 ACQUIRE_LOCK(ump);
                     /* Only items handled with error == 0 count toward target. */
1822                 if (error == 0) {
1823                         if (++matchcnt == target)
1824                                 break;
1825                         continue;
1826                 }
1827                 /*
1828                  * We have to retry the worklist item later.  Wake up any
1829                  * waiters who may be able to complete it immediately and
1830                  * add the item back to the head so we don't try to execute
1831                  * it again.
1832                  */
1833                 wk->wk_state &= ~INPROGRESS;
1834                 wake_worklist(wk);
1835                 add_to_worklist(wk, WK_HEAD);
1836         }
1837         LIST_REMOVE(&sentinel, wk_list);
1838         /*
              * Sentinel could've become the tail from remove_from_worklist.
              * NOTE(review): the cast of le_prev to a worklist pointer
              * presumably relies on wk_list being the first member of
              * struct worklist -- confirm against the structure definition.
              */
1839         if (ump->softdep_worklist_tail == &sentinel)
1840                 ump->softdep_worklist_tail =
1841                     (struct worklist *)sentinel.wk_list.le_prev;
1842         PRELE(curproc);
1843         return (matchcnt);
1844 }
1845
1846 /*
1847  * Move dependencies from one buffer to another.
      *
      * Every work item on oldbp->b_dep is relinked, in its existing order,
      * ahead of anything already on newbp->b_dep.  Returns 0 immediately when
      * oldbp has no dependencies; otherwise returns 1 if
      * bmsafemap_backgroundwrite() returned nonzero for any D_BMSAFEMAP item
      * that was moved, and 0 if not.
1848  */
1849 int
1850 softdep_move_dependencies(oldbp, newbp)
1851         struct buf *oldbp;
1852         struct buf *newbp;
1853 {
1854         struct worklist *wk, *wktail;
1855         struct ufsmount *ump;
1856         int dirty;
1857
1858         if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1859                 return (0);
1860         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1861             ("softdep_move_dependencies called on non-softdep filesystem"));
1862         dirty = 0;
1863         wktail = NULL;
             /* The mount is taken from the first work item on oldbp. */
1864         ump = VFSTOUFS(wk->wk_mp);
1865         ACQUIRE_LOCK(ump);
1866         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1867                 LIST_REMOVE(wk, wk_list);
1868                 if (wk->wk_type == D_BMSAFEMAP &&
1869                     bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1870                         dirty = 1;
                     /* Append after the previously moved item to keep order. */
1871                 if (wktail == NULL)
1872                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1873                 else
1874                         LIST_INSERT_AFTER(wktail, wk, wk_list);
1875                 wktail = wk;
1876         }
1877         FREE_LOCK(ump);
1878
1879         return (dirty);
1880 }
1881
1882 /*
1883  * Purge the work list of all items associated with a particular mount point.
      *
      * On return *countp holds the sum of the positive counts returned by
      * softdep_process_worklist().  The return value is 0, or the error from
      * the VOP_FSYNC() call that stopped the loop.  The thread pointer td is
      * only passed through to VOP_FSYNC().
1884  */
1885 int
1886 softdep_flushworklist(oldmnt, countp, td)
1887         struct mount *oldmnt;
1888         int *countp;
1889         struct thread *td;
1890 {
1891         struct vnode *devvp;
1892         struct ufsmount *ump;
1893         int count, error;
1894
1895         /*
1896          * Alternately flush the block device associated with the mount
1897          * point and process any dependencies that the flushing
1898          * creates. We continue until no more worklist dependencies
1899          * are found.
1900          */
1901         *countp = 0;
1902         error = 0;
1903         ump = VFSTOUFS(oldmnt);
1904         devvp = ump->um_devvp;
1905         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1906                 *countp += count;
                     /* Synchronously fsync the device vnode after each batch. */
1907                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1908                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1909                 VOP_UNLOCK(devvp, 0);
1910                 if (error != 0)
1911                         break;
1912         }
1913         return (error);
1914 }
1915
1916 #define SU_WAITIDLE_RETRIES     20
     /*
      * Wait for the mount's softdep_deps count to reach zero.
      *
      * Up to SU_WAITIDLE_RETRIES times, while softdep_deps is nonzero, the
      * function sets softdep_req, sleeps on &ump->softdep_deps for at most
      * 10 * hz ticks and then fsyncs the device vnode with MNT_WAIT.
      *
      * Returns 0 on success, the VOP_FSYNC() error if one occurs, or EBUSY
      * (after printing a message) when all retries were used and
      * softdep_deps is still nonzero.  The flags argument is only examined
      * inside the KASSERT, hence the __unused annotation.
      */
1917 static int
1918 softdep_waitidle(struct mount *mp, int flags __unused)
1919 {
1920         struct ufsmount *ump;
1921         struct vnode *devvp;
1922         struct thread *td;
1923         int error, i;
1924
1925         ump = VFSTOUFS(mp);
1926         devvp = ump->um_devvp;
1927         td = curthread;
1928         error = 0;
1929         ACQUIRE_LOCK(ump);
1930         for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
1931                 ump->softdep_req = 1;
1932                 KASSERT((flags & FORCECLOSE) == 0 ||
1933                     ump->softdep_on_worklist == 0,
1934                     ("softdep_waitidle: work added after flush"));
                     /*
                      * PDROP is passed to msleep(), and the lock is taken
                      * again explicitly below once the fsync has finished.
                      */
1935                 msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
1936                     "softdeps", 10 * hz);
1937                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1938                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1939                 VOP_UNLOCK(devvp, 0);
1940                 ACQUIRE_LOCK(ump);
1941                 if (error != 0)
1942                         break;
1943         }
1944         ump->softdep_req = 0;
             /* Retries exhausted without error but dependencies remain. */
1945         if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
1946                 error = EBUSY;
1947                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
1948                     mp);
1949         }
1950         FREE_LOCK(ump);
1951         return (error);
1952 }
1953
1954 /*
1955  * Flush all vnodes and worklist items associated with a specified mount point.
      *
      * Returns 0 on success, an error from ffs_flushfiles(),
      * softdep_flushworklist() or softdep_waitidle(), or EBUSY when the flush
      * loop runs out of iterations on a mount that is not being unmounted, or
      * when an unmount still has work left after the last retry.  Running out
      * of loop iterations while MNTK_UNMOUNT is set panics.
1956  */
1957 int
1958 softdep_flushfiles(oldmnt, flags, td)
1959         struct mount *oldmnt;
1960         int flags;
1961         struct thread *td;
1962 {
1963 #ifdef QUOTA
1964         struct ufsmount *ump;
1965         int i;
1966 #endif
1967         int error, early, depcount, loopcnt, retry_flush_count, retry;
1968         int morework;
1969
1970         KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1971             ("softdep_flushfiles called on non-softdep filesystem"));
1972         loopcnt = 10;
1973         retry_flush_count = 3;
1974 retry_flush:
1975         error = 0;
1976
1977         /*
1978          * Alternately flush the vnodes associated with the mount
1979          * point and process any dependencies that the flushing
1980          * creates. In theory, this loop can happen at most twice,
1981          * but we give it a few extra just to be sure.
1982          */
1983         for (; loopcnt > 0; loopcnt--) {
1984                 /*
1985                  * Do another flush in case any vnodes were brought in
1986                  * as part of the cleanup operations.
                      *
                      * "||" binds tighter than "?:", so EARLYFLUSH is added
                      * only when MNTK_UNMOUNT is set and retry_flush_count
                      * is not 1 (i.e. not on the last retry pass).
1987                  */
1988                 early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1989                     MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1990                 if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1991                         break;
1992                 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1993                     depcount == 0)
1994                         break;
1995         }
1996         /*
1997          * If we are unmounting then it is an error to fail. If we
1998          * are simply trying to downgrade to read-only, then filesystem
1999          * activity can keep us busy forever, so we just fail with EBUSY.
2000          */
2001         if (loopcnt == 0) {
2002                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2003                         panic("softdep_flushfiles: looping");
2004                 error = EBUSY;
2005         }
2006         if (!error)
2007                 error = softdep_waitidle(oldmnt, flags);
2008         if (!error) {
2009                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2010                         retry = 0;
2011                         MNT_ILOCK(oldmnt);
2012                         KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
2013                             ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
                             /*
                              * There is more work if the mount's vnode list is
                              * not empty or, with QUOTA, if any quota vnode is
                              * still set.
                              */
2014                         morework = oldmnt->mnt_nvnodelistsize > 0;
2015 #ifdef QUOTA
2016                         ump = VFSTOUFS(oldmnt);
2017                         UFS_LOCK(ump);
2018                         for (i = 0; i < MAXQUOTAS; i++) {
2019                                 if (ump->um_quotas[i] != NULLVP)
2020                                         morework = 1;
2021                         }
2022                         UFS_UNLOCK(ump);
2023 #endif
                             /*
                              * Retry the whole flush with a fresh budget of
                              * three loop iterations, or give up with EBUSY
                              * once retry_flush_count is used up.
                              */
2024                         if (morework) {
2025                                 if (--retry_flush_count > 0) {
2026                                         retry = 1;
2027                                         loopcnt = 3;
2028                                 } else
2029                                         error = EBUSY;
2030                         }
2031                         MNT_IUNLOCK(oldmnt);
2032                         if (retry)
2033                                 goto retry_flush;
2034                 }
2035         }
2036         return (error);
2037 }
2038
2039 /*
2040  * Structure hashing.
2041  *
2042  * There are four types of structures that can be looked up:
2043  *      1) pagedep structures identified by mount point, inode number,
2044  *         and logical block.
2045  *      2) inodedep structures identified by mount point and inode number.
2046  *      3) newblk structures identified by mount point and
2047  *         physical block number.
2048  *      4) bmsafemap structures identified by mount point and
2049  *         cylinder group number.
2050  *
2051  * The "pagedep" and "inodedep" dependency structures are hashed
2052  * separately from the file blocks and inodes to which they correspond.
2053  * This separation helps when the in-memory copy of an inode or
2054  * file block must be replaced. It also obviates the need to access
2055  * an inode or file page when simply updating (or de-allocating)
2056  * dependency structures. Lookup of newblk structures is needed to
2057  * find newly allocated blocks when trying to associate them with
2058  * their allocdirect or allocindir structure.
2059  *
2060  * The lookup routines optionally create and hash a new instance when
2061  * an existing entry is not found. The bmsafemap lookup routine always
2062  * allocates a new structure if an existing one is not found.
2063  */
2064 #define DEPALLOC        0x0001  /* allocate structure if lookup fails */
2065
2066 /*
2067  * Structures and routines associated with pagedep caching.
      *
      * PAGEDEP_HASH yields the address of the hash chain head for an
      * (inode number, logical block) pair: their sum is ANDed with
      * pagedep_hash_size, which is therefore used as a bit mask.
2068  */
2069 #define PAGEDEP_HASH(ump, inum, lbn) \
2070         (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2071
     /*
      * Search one pagedep hash chain for the entry matching both ino and lbn.
      * On a match, store it in *pagedeppp and return 1; otherwise store NULL
      * in *pagedeppp and return 0.
      */
2072 static int
2073 pagedep_find(pagedephd, ino, lbn, pagedeppp)
2074         struct pagedep_hashhead *pagedephd;
2075         ino_t ino;
2076         ufs_lbn_t lbn;
2077         struct pagedep **pagedeppp;
2078 {
2079         struct pagedep *pagedep;
2080
2081         LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2082                 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2083                         *pagedeppp = pagedep;
2084                         return (1);
2085                 }
2086         }
2087         *pagedeppp = NULL;
2088         return (0);
2089 }
2090 /*
2091  * Look up a pagedep. Return 1 if an existing entry was found, 0 otherwise
      * (including when a new entry was allocated).
2092  * If not found, allocate if DEPALLOC flag is passed.
2093  * Found or allocated entry is returned in pagedeppp; it is set to NULL
      * when nothing is found and DEPALLOC is not passed.
2094  * The softdep lock must be held on entry (LOCK_OWNED is asserted); it is
      * released around the allocation and is held again on return.
2095  */
2096 static int
2097 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2098         struct mount *mp;
2099         struct buf *bp;
2100         ino_t ino;
2101         ufs_lbn_t lbn;
2102         int flags;
2103         struct pagedep **pagedeppp;
2104 {
2105         struct pagedep *pagedep;
2106         struct pagedep_hashhead *pagedephd;
2107         struct worklist *wk;
2108         struct ufsmount *ump;
2109         int ret;
2110         int i;
2111
2112         ump = VFSTOUFS(mp);
2113         LOCK_OWNED(ump);
             /*
              * Fast path: return the first D_PAGEDEP already attached to the
              * buffer.  Note that ino and lbn are not compared here.
              */
2114         if (bp) {
2115                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2116                         if (wk->wk_type == D_PAGEDEP) {
2117                                 *pagedeppp = WK_PAGEDEP(wk);
2118                                 return (1);
2119                         }
2120                 }
2121         }
2122         pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2123         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2124         if (ret) {
                     /* Attach a hashed entry to the buffer if it is on no list. */
2125                 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2126                         WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2127                 return (1);
2128         }
2129         if ((flags & DEPALLOC) == 0)
2130                 return (0);
             /* Allocate with the lock dropped, then look again for a racer. */
2131         FREE_LOCK(ump);
2132         pagedep = malloc(sizeof(struct pagedep),
2133             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2134         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2135         ACQUIRE_LOCK(ump);
2136         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2137         if (*pagedeppp) {
2138                 /*
2139                  * This should never happen since we only create pagedeps
2140                  * with the vnode lock held.  Could be an assert.
2141                  */
2142                 WORKITEM_FREE(pagedep, D_PAGEDEP);
2143                 return (ret);
2144         }
2145         pagedep->pd_ino = ino;
2146         pagedep->pd_lbn = lbn;
2147         LIST_INIT(&pagedep->pd_dirremhd);
2148         LIST_INIT(&pagedep->pd_pendinghd);
2149         for (i = 0; i < DAHASHSZ; i++)
2150                 LIST_INIT(&pagedep->pd_diraddhd[i]);
2151         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
             /*
              * NOTE(review): bp is dereferenced unconditionally here, unlike
              * the NULL-tolerant paths above; callers passing DEPALLOC
              * presumably always supply a buffer -- confirm.
              */
2152         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2153         *pagedeppp = pagedep;
2154         return (0);
2155 }
2156
2157 /*
2158  * Structures and routines associated with inodedep caching.
      *
      * INODEDEP_HASH yields the address of the hash chain head for an inode
      * number by ANDing it with inodedep_hash_size, which is therefore used
      * as a bit mask.
2159  */
2160 #define INODEDEP_HASH(ump, inum) \
2161       (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2162
     /*
      * Search one inodedep hash chain for the entry whose id_ino equals inum.
      * On a match, store it in *inodedeppp and return 1; otherwise store NULL
      * in *inodedeppp and return 0.
      */
2163 static int
2164 inodedep_find(inodedephd, inum, inodedeppp)
2165         struct inodedep_hashhead *inodedephd;
2166         ino_t inum;
2167         struct inodedep **inodedeppp;
2168 {
2169         struct inodedep *inodedep;
2170
             /* inodedep is left NULL when the chain is exhausted. */
2171         LIST_FOREACH(inodedep, inodedephd, id_hash)
2172                 if (inum == inodedep->id_ino)
2173                         break;
2174         if (inodedep) {
2175                 *inodedeppp = inodedep;
2176                 return (1);
2177         }
2178         *inodedeppp = NULL;
2179
2180         return (0);
2181 }
2182 /*
2183  * Look up an inodedep. Return 1 if found, 0 if not found (including when
      * a new entry was allocated).
2184  * If not found, allocate if DEPALLOC flag is passed.
2185  * Found or allocated entry is returned in inodedeppp; it is set to NULL
      * when nothing is found and DEPALLOC is not passed.
2186  * The softdep lock must be held on entry (LOCK_OWNED is asserted).  With
      * DEPALLOC it is not held across the allocation and is reacquired
      * before returning.
2187  */
2188 static int
2189 inodedep_lookup(mp, inum, flags, inodedeppp)
2190         struct mount *mp;
2191         ino_t inum;
2192         int flags;
2193         struct inodedep **inodedeppp;
2194 {
2195         struct inodedep *inodedep;
2196         struct inodedep_hashhead *inodedephd;
2197         struct ufsmount *ump;
2198         struct fs *fs;
2199
2200         ump = VFSTOUFS(mp);
2201         LOCK_OWNED(ump);
2202         fs = ump->um_fs;
2203         inodedephd = INODEDEP_HASH(ump, inum);
2204
2205         if (inodedep_find(inodedephd, inum, inodedeppp))
2206                 return (1);
2207         if ((flags & DEPALLOC) == 0)
2208                 return (0);
2209         /*
2210          * If the system is over its limit and our filesystem is
2211          * responsible for more than our share of that usage and
2212          * we are not in a rush, request some inodedep cleanup.
              *
              * NOTE(review): only the else branch calls FREE_LOCK, yet
              * ACQUIRE_LOCK follows unconditionally, so schedule_cleanup()
              * presumably returns with the lock released -- confirm.
2213          */
2214         if (softdep_excess_items(ump, D_INODEDEP))
2215                 schedule_cleanup(mp);
2216         else
2217                 FREE_LOCK(ump);
2218         inodedep = malloc(sizeof(struct inodedep),
2219                 M_INODEDEP, M_SOFTDEP_FLAGS);
2220         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2221         ACQUIRE_LOCK(ump);
             /* Someone may have inserted the entry while the lock was dropped. */
2222         if (inodedep_find(inodedephd, inum, inodedeppp)) {
2223                 WORKITEM_FREE(inodedep, D_INODEDEP);
2224                 return (1);
2225         }
             /*
              * M_ZERO is not requested explicitly in the malloc() above, so
              * the fields are initialized one by one.
              */
2226         inodedep->id_fs = fs;
2227         inodedep->id_ino = inum;
2228         inodedep->id_state = ALLCOMPLETE;
2229         inodedep->id_nlinkdelta = 0;
2230         inodedep->id_savedino1 = NULL;
2231         inodedep->id_savedsize = -1;
2232         inodedep->id_savedextsize = -1;
2233         inodedep->id_savednlink = -1;
2234         inodedep->id_bmsafemap = NULL;
2235         inodedep->id_mkdiradd = NULL;
2236         LIST_INIT(&inodedep->id_dirremhd);
2237         LIST_INIT(&inodedep->id_pendinghd);
2238         LIST_INIT(&inodedep->id_inowait);
2239         LIST_INIT(&inodedep->id_bufwait);
2240         TAILQ_INIT(&inodedep->id_inoreflst);
2241         TAILQ_INIT(&inodedep->id_inoupdt);
2242         TAILQ_INIT(&inodedep->id_newinoupdt);
2243         TAILQ_INIT(&inodedep->id_extupdt);
2244         TAILQ_INIT(&inodedep->id_newextupdt);
2245         TAILQ_INIT(&inodedep->id_freeblklst);
2246         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2247         *inodedeppp = inodedep;
2248         return (0);
2249 }
2250
2251 /*
2252  * Structures and routines associated with newblk caching.
      *
      * NEWBLK_HASH yields the address of the hash chain head by ANDing its
      * second argument with newblk_hash_size (used as a bit mask).  Although
      * the macro parameter is named inum, newblk_lookup() passes the block
      * number (newblkno) here.
2253  */
2254 #define NEWBLK_HASH(ump, inum) \
2255         (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2256
     /*
      * Search one newblk hash chain for an entry whose nb_newblkno equals
      * newblkno.  On a match, store it in *newblkpp and return 1; otherwise
      * store NULL in *newblkpp and return 0.  When DEPALLOC is set in flags,
      * entries whose work item type is no longer D_NEWBLK are skipped.
      */
2257 static int
2258 newblk_find(newblkhd, newblkno, flags, newblkpp)
2259         struct newblk_hashhead *newblkhd;
2260         ufs2_daddr_t newblkno;
2261         int flags;
2262         struct newblk **newblkpp;
2263 {
2264         struct newblk *newblk;
2265
2266         LIST_FOREACH(newblk, newblkhd, nb_hash) {
2267                 if (newblkno != newblk->nb_newblkno)
2268                         continue;
2269                 /*
2270                  * If we're creating a new dependency don't match those that
2271                  * have already been converted to allocdirects.  This is for
2272                  * a frag extend.
2273                  */
2274                 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2275                         continue;
2276                 break;
2277         }
             /* newblk is NULL here when the chain held no acceptable entry. */
2278         if (newblk) {
2279                 *newblkpp = newblk;
2280                 return (1);
2281         }
2282         *newblkpp = NULL;
2283         return (0);
2284 }
2285
2286 /*
2287  * Look up a newblk. Return 1 if found, 0 if not found (including when a
      * new entry was allocated).
2288  * If not found, allocate if DEPALLOC flag is passed.
2289  * Found or allocated entry is returned in newblkpp; it is set to NULL
      * when nothing is found and DEPALLOC is not passed.
      * The softdep lock must be held on entry (LOCK_OWNED is asserted).  With
      * DEPALLOC it is not held across the allocation and is reacquired
      * before returning.
2290  */
2291 static int
2292 newblk_lookup(mp, newblkno, flags, newblkpp)
2293         struct mount *mp;
2294         ufs2_daddr_t newblkno;
2295         int flags;
2296         struct newblk **newblkpp;
2297 {
2298         struct newblk *newblk;
2299         struct newblk_hashhead *newblkhd;
2300         struct ufsmount *ump;
2301
2302         ump = VFSTOUFS(mp);
2303         LOCK_OWNED(ump);
2304         newblkhd = NEWBLK_HASH(ump, newblkno);
2305         if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2306                 return (1);
2307         if ((flags & DEPALLOC) == 0)
2308                 return (0);
             /*
              * Request cleanup when any of the three item types is in excess.
              * NOTE(review): only the else branch calls FREE_LOCK, yet
              * ACQUIRE_LOCK follows unconditionally, so schedule_cleanup()
              * presumably returns with the lock released -- confirm.
              */
2309         if (softdep_excess_items(ump, D_NEWBLK) ||
2310             softdep_excess_items(ump, D_ALLOCDIRECT) ||
2311             softdep_excess_items(ump, D_ALLOCINDIR))
2312                 schedule_cleanup(mp);
2313         else
2314                 FREE_LOCK(ump);
             /* The allocation is sized for union allblk, not struct newblk. */
2315         newblk = malloc(sizeof(union allblk), M_NEWBLK,
2316             M_SOFTDEP_FLAGS | M_ZERO);
2317         workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2318         ACQUIRE_LOCK(ump);
             /* Someone may have inserted the entry while the lock was dropped. */
2319         if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2320                 WORKITEM_FREE(newblk, D_NEWBLK);
2321                 return (1);
2322         }
2323         newblk->nb_freefrag = NULL;
2324         LIST_INIT(&newblk->nb_indirdeps);
2325         LIST_INIT(&newblk->nb_newdirblk);
2326         LIST_INIT(&newblk->nb_jwork);
2327         newblk->nb_state = ATTACHED;
2328         newblk->nb_newblkno = newblkno;
2329         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2330         *newblkpp = newblk;
2331         return (0);
2332 }
2333
2334 /*
2335  * Structures and routines associated with freed indirect block caching.
      *
      * INDIR_HASH yields the address of the hash chain head for a block
      * number by ANDing it with indir_hash_size, which is therefore used as
      * a bit mask.
2336  */
2337 #define INDIR_HASH(ump, blkno) \
2338         (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2339
2340 /*
2341  * Lookup an indirect block in the indir hash table.  The freework is
2342  * removed and potentially freed.  The caller must do a blocking journal
2343  * write before writing to the blkno.
      *
      * Returns 1 if a freework with a matching fw_blkno was found (after
      * indirblk_remove() has been called on it), 0 otherwise.
2344  */
2345 static int
2346 indirblk_lookup(mp, blkno)
2347         struct mount *mp;
2348         ufs2_daddr_t blkno;
2349 {
2350         struct freework *freework;
2351         struct indir_hashhead *wkhd;
2352         struct ufsmount *ump;
2353
2354         ump = VFSTOUFS(mp);
2355         wkhd = INDIR_HASH(ump, blkno);
2356         TAILQ_FOREACH(freework, wkhd, fw_next) {
2357                 if (freework->fw_blkno != blkno)
2358                         continue;
                     /*
                      * Return right after the removal; the iteration does not
                      * continue past the removed entry.
                      */
2359                 indirblk_remove(freework);
2360                 return (1);
2361         }
2362         return (0);
2363 }
2364
2365 /*
2366  * Insert an indirect block represented by freework into the indirblk
2367  * hash table so that it may prevent the block from being re-used prior
2368  * to the journal being written.
      *
      * The freework is also linked onto the js_indirs list of the last
      * journal segment, and DEPCOMPLETE is cleared in its state.  If the
      * journal has no segments, nothing is inserted and the state is left
      * untouched.
2369  */
2370 static void
2371 indirblk_insert(freework)
2372         struct freework *freework;
2373 {
2374         struct jblocks *jblocks;
2375         struct jseg *jseg;
2376         struct ufsmount *ump;
2377
2378         ump = VFSTOUFS(freework->fw_list.wk_mp);
2379         jblocks = ump->softdep_jblocks;
             /* Use the last segment on the jb_segs list, if there is one. */
2380         jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2381         if (jseg == NULL)
2382                 return;
2383
2384         LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2385         TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2386             fw_next);
2387         freework->fw_state &= ~DEPCOMPLETE;
2388 }
2389
     /*
      * Undo indirblk_insert(): unlink the freework from its fw_segs list and
      * from the indir hash chain and set DEPCOMPLETE.  The freework is freed
      * here if that leaves its state ALLCOMPLETE.
      */
2390 static void
2391 indirblk_remove(freework)
2392         struct freework *freework;
2393 {
2394         struct ufsmount *ump;
2395
2396         ump = VFSTOUFS(freework->fw_list.wk_mp);
2397         LIST_REMOVE(freework, fw_segs);
2398         TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2399         freework->fw_state |= DEPCOMPLETE;
             /* The caller must not touch freework after this point if freed. */
2400         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2401                 WORKITEM_FREE(freework, D_FREEWORK);
2402 }
2403
2404 /*
2405  * Executed during filesystem initialization before
2406  * mounting any filesystems.
      *
      * Initializes the softdepmounts list, sets max_softdeps from
      * desiredvnodes, installs the softdep handlers in bioops, sets
      * softdep_ast_cleanup and initializes softdep_callout with the lk lock.
2407  */
2408 void
2409 softdep_initialize()
2410 {
2411
2412         TAILQ_INIT(&softdepmounts);
             /* Four items per desired vnode on LP64, two otherwise. */
2413 #ifdef __LP64__
2414         max_softdeps = desiredvnodes * 4;
2415 #else
2416         max_softdeps = desiredvnodes * 2;
2417 #endif
2418
2419         /* initialise bioops hack */
2420         bioops.io_start = softdep_disk_io_initiation;
2421         bioops.io_complete = softdep_disk_write_complete;
2422         bioops.io_deallocate = softdep_deallocate_dependencies;
2423         bioops.io_countdeps = softdep_count_dependencies;
2424         softdep_ast_cleanup = softdep_ast_cleanup_proc;
2425
2426         /* Initialize the callout with an mtx. */
2427         callout_init_mtx(&softdep_callout, &lk, 0);
2428 }
2429
2430 /*
2431  * Executed after all filesystems have been unmounted during
2432  * filesystem module unload.
      *
      * Clears every hook installed by softdep_initialize() and drains
      * softdep_callout.
2433  */
2434 void
2435 softdep_uninitialize()
2436 {
2437
2438         /* clear bioops hack */
2439         bioops.io_start = NULL;
2440         bioops.io_complete = NULL;
2441         bioops.io_deallocate = NULL;
2442         bioops.io_countdeps = NULL;
2443         softdep_ast_cleanup = NULL;
2444
             /* The callout is drained only after the hooks are cleared. */
2445         callout_drain(&softdep_callout);
2446 }
2447
/*
 * Called at mount time to notify the dependency code that a
 * filesystem wishes to use it.
 *
 * Allocates the per-mount soft updates state, marks the mount as using
 * soft updates, initializes the dependency lists and hash tables, links
 * the mount onto the global softdepmounts list, starts the journal when
 * the superblock has FS_SUJ set, and creates the per-mount flush thread.
 * If vfs.ffs.compute_summary_at_mount is set and the filesystem is not
 * marked clean, the superblock summary totals are recomputed from the
 * cylinder groups.
 *
 * Returns 0 on success, or the error from journal_mount() or bread().
 * On either failure softdep_unmount() has already been called.
 */
int
softdep_mount(devvp, mp, fs, cred)
        struct vnode *devvp;
        struct mount *mp;
        struct fs *fs;
        struct ucred *cred;
{
        struct csum_total cstotal;
        struct mount_softdeps *sdp;
        struct ufsmount *ump;
        struct cg *cgp;
        struct buf *bp;
        int i, error, cyl;

        sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
            M_WAITOK | M_ZERO);
        /*
         * Under the mount interlock: clear the async flags, mark the
         * mount as soft updates, and attach the new state to the ufsmount.
         */
        MNT_ILOCK(mp);
        mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
        if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
                mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
                        MNTK_SOFTDEP | MNTK_NOASYNC;
        }
        ump = VFSTOUFS(mp);
        ump->um_softdep = sdp;
        MNT_IUNLOCK(mp);
        rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
        sdp->sd_ump = ump;
        /* Start with empty work lists and zeroed counters. */
        LIST_INIT(&ump->softdep_workitem_pending);
        LIST_INIT(&ump->softdep_journal_pending);
        TAILQ_INIT(&ump->softdep_unlinked);
        LIST_INIT(&ump->softdep_dirtycg);
        ump->softdep_worklist_tail = NULL;
        ump->softdep_on_worklist = 0;
        ump->softdep_deps = 0;
        LIST_INIT(&ump->softdep_mkdirlisthd);
        /* Per-mount dependency hash tables. */
        ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
            &ump->pagedep_hash_size);
        ump->pagedep_nextclean = 0;
        ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
            &ump->inodedep_hash_size);
        ump->inodedep_nextclean = 0;
        ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
            &ump->newblk_hash_size);
        ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
            &ump->bmsafemap_hash_size);
        /*
         * The indirect hash is allocated by hand: i is a power of two
         * derived from desiredvnodes / 10 and indir_hash_size stores
         * i - 1 (presumably used as a mask -- confirm against the
         * hashing macros).
         */
        i = 1 << (ffs(desiredvnodes / 10) - 1);
        ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
            M_FREEWORK, M_WAITOK);
        ump->indir_hash_size = i - 1;
        for (i = 0; i <= ump->indir_hash_size; i++)
                TAILQ_INIT(&ump->indir_hashtbl[i]);
        /* Publish this mount on the global list of softdep mounts. */
        ACQUIRE_GBLLOCK(&lk);
        TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
        FREE_GBLLOCK(&lk);
        /* Only journaled (SUJ) filesystems need the journal opened. */
        if ((fs->fs_flags & FS_SUJ) &&
            (error = journal_mount(mp, fs, cred)) != 0) {
                printf("Failed to start journal: %d\n", error);
                softdep_unmount(mp);
                return (error);
        }
        /*
         * Start our flushing thread in the bufdaemon process.
         */
        ACQUIRE_LOCK(ump);
        ump->softdep_flags |= FLUSH_STARTING;
        FREE_LOCK(ump);
        kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
            &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
            mp->mnt_stat.f_mntonname);
        /*
         * Sleep until FLUSH_STARTING is cleared (presumably by the new
         * thread once it is running -- confirm in softdep_flush).  The
         * hz / 2 timeout only makes the loop re-check the flag.
         */
        ACQUIRE_LOCK(ump);
        while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
                msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
                    hz / 2);
        }
        FREE_LOCK(ump);
        /*
         * When doing soft updates, the counters in the
         * superblock may have gotten out of sync. Recomputation
         * can take a long time and can be deferred for background
         * fsck.  However, the old behavior of scanning the cylinder
         * groups and recalculating them at mount time is available
         * by setting vfs.ffs.compute_summary_at_mount to one.
         */
        if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
                return (0);
        bzero(&cstotal, sizeof cstotal);
        /* Read every cylinder group and accumulate its summary counts. */
        for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
                if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
                    fs->fs_cgsize, cred, &bp)) != 0) {
                        /*
                         * NOTE(review): bp is released even though bread()
                         * failed; this assumes bread() still hands back a
                         * buffer on error -- confirm for this kernel version.
                         */
                        brelse(bp);
                        softdep_unmount(mp);
                        return (error);
                }
                cgp = (struct cg *)bp->b_data;
                cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
                cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
                cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
                cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
                /* Refresh the in-core per-group summary as well. */
                fs->fs_cs(fs, cyl) = cgp->cg_cs;
                brelse(bp);
        }
#ifdef DEBUG
        if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
                printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
#endif
        bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
        return (0);
}
2560
2561 void
2562 softdep_unmount(mp)
2563         struct mount *mp;
2564 {
2565         struct ufsmount *ump;
2566 #ifdef INVARIANTS
2567         int i;
2568 #endif
2569
2570         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2571             ("softdep_unmount called on non-softdep filesystem"));
2572         ump = VFSTOUFS(mp);
2573         MNT_ILOCK(mp);
2574         mp->mnt_flag &= ~MNT_SOFTDEP;
2575         if (MOUNTEDSUJ(mp) == 0) {
2576                 MNT_IUNLOCK(mp);
2577         } else {
2578                 mp->mnt_flag &= ~MNT_SUJ;
2579                 MNT_IUNLOCK(mp);
2580                 journal_unmount(ump);
2581         }
2582         /*
2583          * Shut down our flushing thread. Check for NULL is if
2584          * softdep_mount errors out before the thread has been created.
2585          */
2586         if (ump->softdep_flushtd != NULL) {
2587                 ACQUIRE_LOCK(ump);
2588                 ump->softdep_flags |= FLUSH_EXIT;
2589                 wakeup(&ump->softdep_flushtd);
2590                 msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2591                     "sdwait", 0);
2592                 KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2593                     ("Thread shutdown failed"));
2594         }
2595         /*
2596          * Free up our resources.
2597          */
2598         ACQUIRE_GBLLOCK(&lk);
2599         TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2600         FREE_GBLLOCK(&lk);
2601         rw_destroy(LOCK_PTR(ump));
2602         hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2603         hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2604         hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2605         hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2606             ump->bmsafemap_hash_size);
2607         free(ump->indir_hashtbl, M_FREEWORK);
2608 #ifdef INVARIANTS
2609         for (i = 0; i <= D_LAST; i++)
2610                 KASSERT(ump->softdep_curdeps[i] == 0,
2611                     ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2612                     TYPENAME(i), ump->softdep_curdeps[i]));
2613 #endif
2614         free(ump->um_softdep, M_MOUNTDATA);
2615 }
2616
2617 static struct jblocks *
2618 jblocks_create(void)
2619 {
2620         struct jblocks *jblocks;
2621
2622         jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2623         TAILQ_INIT(&jblocks->jb_segs);
2624         jblocks->jb_avail = 10;
2625         jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2626             M_JBLOCKS, M_WAITOK | M_ZERO);
2627
2628         return (jblocks);
2629 }
2630
2631 static ufs2_daddr_t
2632 jblocks_alloc(jblocks, bytes, actual)
2633         struct jblocks *jblocks;
2634         int bytes;
2635         int *actual;
2636 {
2637         ufs2_daddr_t daddr;
2638         struct jextent *jext;
2639         int freecnt;
2640         int blocks;
2641
2642         blocks = bytes / DEV_BSIZE;
2643         jext = &jblocks->jb_extent[jblocks->jb_head];
2644         freecnt = jext->je_blocks - jblocks->jb_off;
2645         if (freecnt == 0) {
2646                 jblocks->jb_off = 0;
2647                 if (++jblocks->jb_head > jblocks->jb_used)
2648                         jblocks->jb_head = 0;
2649                 jext = &jblocks->jb_extent[jblocks->jb_head];
2650                 freecnt = jext->je_blocks;
2651         }
2652         if (freecnt > blocks)
2653                 freecnt = blocks;
2654         *actual = freecnt * DEV_BSIZE;
2655         daddr = jext->je_daddr + jblocks->jb_off;
2656         jblocks->jb_off += freecnt;
2657         jblocks->jb_free -= freecnt;
2658
2659         return (daddr);
2660 }
2661
2662 static void
2663 jblocks_free(jblocks, mp, bytes)
2664         struct jblocks *jblocks;
2665         struct mount *mp;
2666         int bytes;
2667 {
2668
2669         LOCK_OWNED(VFSTOUFS(mp));
2670         jblocks->jb_free += bytes / DEV_BSIZE;
2671         if (jblocks->jb_suspended)
2672                 worklist_speedup(mp);
2673         wakeup(jblocks);
2674 }
2675
2676 static void
2677 jblocks_destroy(jblocks)
2678         struct jblocks *jblocks;
2679 {
2680
2681         if (jblocks->jb_extent)
2682                 free(jblocks->jb_extent, M_JBLOCKS);
2683         free(jblocks, M_JBLOCKS);
2684 }
2685
2686 static void
2687 jblocks_add(jblocks, daddr, blocks)
2688         struct jblocks *jblocks;
2689         ufs2_daddr_t daddr;
2690         int blocks;
2691 {
2692         struct jextent *jext;
2693
2694         jblocks->jb_blocks += blocks;
2695         jblocks->jb_free += blocks;
2696         jext = &jblocks->jb_extent[jblocks->jb_used];
2697         /* Adding the first block. */
2698         if (jext->je_daddr == 0) {
2699                 jext->je_daddr = daddr;
2700                 jext->je_blocks = blocks;
2701                 return;
2702         }
2703         /* Extending the last extent. */
2704         if (jext->je_daddr + jext->je_blocks == daddr) {
2705                 jext->je_blocks += blocks;
2706                 return;
2707         }
2708         /* Adding a new extent. */
2709         if (++jblocks->jb_used == jblocks->jb_avail) {
2710                 jblocks->jb_avail *= 2;
2711                 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2712                     M_JBLOCKS, M_WAITOK | M_ZERO);
2713                 memcpy(jext, jblocks->jb_extent,
2714                     sizeof(struct jextent) * jblocks->jb_used);
2715                 free(jblocks->jb_extent, M_JBLOCKS);
2716                 jblocks->jb_extent = jext;
2717         }
2718         jext = &jblocks->jb_extent[jblocks->jb_used];
2719         jext->je_daddr = daddr;
2720         jext->je_blocks = blocks;
2721         return;
2722 }
2723
2724 int
2725 softdep_journal_lookup(mp, vpp)
2726         struct mount *mp;
2727         struct vnode **vpp;
2728 {
2729         struct componentname cnp;
2730         struct vnode *dvp;
2731         ino_t sujournal;
2732         int error;
2733
2734         error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2735         if (error)
2736                 return (error);
2737         bzero(&cnp, sizeof(cnp));
2738         cnp.cn_nameiop = LOOKUP;
2739         cnp.cn_flags = ISLASTCN;
2740         cnp.cn_thread = curthread;
2741         cnp.cn_cred = curthread->td_ucred;
2742         cnp.cn_pnbuf = SUJ_FILE;
2743         cnp.cn_nameptr = SUJ_FILE;
2744         cnp.cn_namelen = strlen(SUJ_FILE);
2745         error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2746         vput(dvp);
2747         if (error != 0)
2748                 return (error);
2749         error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2750         return (error);
2751 }
2752
2753 /*
2754  * Open and verify the journal file.
2755  */
2756 static int
2757 journal_mount(mp, fs, cred)
2758         struct mount *mp;
2759         struct fs *fs;
2760         struct ucred *cred;
2761 {
2762         struct jblocks *jblocks;
2763         struct ufsmount *ump;
2764         struct vnode *vp;
2765         struct inode *ip;
2766         ufs2_daddr_t blkno;
2767         int bcount;
2768         int error;
2769         int i;
2770
2771         ump = VFSTOUFS(mp);
2772         ump->softdep_journal_tail = NULL;
2773         ump->softdep_on_journal = 0;
2774         ump->softdep_accdeps = 0;
2775         ump->softdep_req = 0;
2776         ump->softdep_jblocks = NULL;
2777         error = softdep_journal_lookup(mp, &vp);
2778         if (error != 0) {
2779                 printf("Failed to find journal.  Use tunefs to create one\n");
2780                 return (error);
2781         }
2782         ip = VTOI(vp);
2783         if (ip->i_size < SUJ_MIN) {
2784                 error = ENOSPC;
2785                 goto out;
2786         }
2787         bcount = lblkno(fs, ip->i_size);        /* Only use whole blocks. */
2788         jblocks = jblocks_create();
2789         for (i = 0; i < bcount; i++) {
2790                 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2791                 if (error)
2792                         break;
2793                 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2794         }
2795         if (error) {
2796                 jblocks_destroy(jblocks);
2797                 goto out;
2798         }
2799         jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
2800         jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2801         ump->softdep_jblocks = jblocks;
2802 out:
2803         if (error == 0) {
2804                 MNT_ILOCK(mp);
2805                 mp->mnt_flag |= MNT_SUJ;
2806                 mp->mnt_flag &= ~MNT_SOFTDEP;
2807                 MNT_IUNLOCK(mp);
2808                 /*
2809                  * Only validate the journal contents if the
2810                  * filesystem is clean, otherwise we write the logs
2811                  * but they'll never be used.  If the filesystem was
2812                  * still dirty when we mounted it the journal is
2813                  * invalid and a new journal can only be valid if it
2814                  * starts from a clean mount.
2815                  */
2816                 if (fs->fs_clean) {
2817                         DIP_SET(ip, i_modrev, fs->fs_mtime);
2818                         ip->i_flags |= IN_MODIFIED;
2819                         ffs_update(vp, 1);
2820                 }
2821         }
2822         vput(vp);
2823         return (error);
2824 }
2825
2826 static void
2827 journal_unmount(ump)
2828         struct ufsmount *ump;
2829 {
2830
2831         if (ump->softdep_jblocks)
2832                 jblocks_destroy(ump->softdep_jblocks);
2833         ump->softdep_jblocks = NULL;
2834 }
2835
2836 /*
2837  * Called when a journal record is ready to be written.  Space is allocated
2838  * and the journal entry is created when the journal is flushed to stable
2839  * store.
2840  */
2841 static void
2842 add_to_journal(wk)
2843         struct worklist *wk;
2844 {
2845         struct ufsmount *ump;
2846
2847         ump = VFSTOUFS(wk->wk_mp);
2848         LOCK_OWNED(ump);
2849         if (wk->wk_state & ONWORKLIST)
2850                 panic("add_to_journal: %s(0x%X) already on list",
2851                     TYPENAME(wk->wk_type), wk->wk_state);
2852         wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2853         if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2854                 ump->softdep_jblocks->jb_age = ticks;
2855                 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2856         } else
2857                 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2858         ump->softdep_journal_tail = wk;
2859         ump->softdep_on_journal += 1;
2860 }
2861
2862 /*
2863  * Remove an arbitrary item for the journal worklist maintain the tail
2864  * pointer.  This happens when a new operation obviates the need to
2865  * journal an old operation.
2866  */
2867 static void
2868 remove_from_journal(wk)
2869         struct worklist *wk;
2870 {
2871         struct ufsmount *ump;
2872
2873         ump = VFSTOUFS(wk->wk_mp);
2874         LOCK_OWNED(ump);
2875 #ifdef SUJ_DEBUG
2876         {
2877                 struct worklist *wkn;
2878
2879                 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2880                         if (wkn == wk)
2881                                 break;
2882                 if (wkn == NULL)
2883                         panic("remove_from_journal: %p is not in journal", wk);
2884         }
2885 #endif
2886         /*
2887          * We emulate a TAILQ to save space in most structures which do not
2888          * require TAILQ semantics.  Here we must update the tail position
2889          * when removing the tail which is not the final entry. This works
2890          * only if the worklist linkage are at the beginning of the structure.
2891          */
2892         if (ump->softdep_journal_tail == wk)
2893                 ump->softdep_journal_tail =
2894                     (struct worklist *)wk->wk_list.le_prev;
2895
2896         WORKLIST_REMOVE(wk);
2897         ump->softdep_on_journal -= 1;
2898 }
2899
2900 /*
2901  * Check for journal space as well as dependency limits so the prelink
2902  * code can throttle both journaled and non-journaled filesystems.
2903  * Threshold is 0 for low and 1 for min.
2904  */
2905 static int
2906 journal_space(ump, thresh)
2907         struct ufsmount *ump;
2908         int thresh;
2909 {
2910         struct jblocks *jblocks;
2911         int limit, avail;
2912
2913         jblocks = ump->softdep_jblocks;
2914         if (jblocks == NULL)
2915                 return (1);
2916         /*
2917          * We use a tighter restriction here to prevent request_cleanup()
2918          * running in threads from running into locks we currently hold.
2919          * We have to be over the limit and our filesystem has to be
2920          * responsible for more than our share of that usage.
2921          */
2922         limit = (max_softdeps / 10) * 9;
2923         if (dep_current[D_INODEDEP] > limit &&
2924             ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2925                 return (0);
2926         if (thresh)
2927                 thresh = jblocks->jb_min;
2928         else
2929                 thresh = jblocks->jb_low;
2930         avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2931         avail = jblocks->jb_free - avail;
2932
2933         return (avail > thresh);
2934 }
2935
2936 static void
2937 journal_suspend(ump)
2938         struct ufsmount *ump;
2939 {
2940         struct jblocks *jblocks;
2941         struct mount *mp;
2942
2943         mp = UFSTOVFS(ump);
2944         jblocks = ump->softdep_jblocks;
2945         MNT_ILOCK(mp);
2946         if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2947                 stat_journal_min++;
2948                 mp->mnt_kern_flag |= MNTK_SUSPEND;
2949                 mp->mnt_susp_owner = ump->softdep_flushtd;
2950         }
2951         jblocks->jb_suspended = 1;
2952         MNT_IUNLOCK(mp);
2953 }
2954
2955 static int
2956 journal_unsuspend(struct ufsmount *ump)
2957 {
2958         struct jblocks *jblocks;
2959         struct mount *mp;
2960
2961         mp = UFSTOVFS(ump);
2962         jblocks = ump->softdep_jblocks;
2963
2964         if (jblocks != NULL && jblocks->jb_suspended &&
2965             journal_space(ump, jblocks->jb_min)) {
2966                 jblocks->jb_suspended = 0;
2967                 FREE_LOCK(ump);
2968                 mp->mnt_susp_owner = curthread;
2969                 vfs_write_resume(mp, 0);
2970                 ACQUIRE_LOCK(ump);
2971                 return (1);
2972         }
2973         return (0);
2974 }
2975
2976 /*
2977  * Called before any allocation function to be certain that there is
2978  * sufficient space in the journal prior to creating any new records.
2979  * Since in the case of block allocation we may have multiple locked
2980  * buffers at the time of the actual allocation we can not block
2981  * when the journal records are created.  Doing so would create a deadlock
2982  * if any of these buffers needed to be flushed to reclaim space.  Instead
2983  * we require a sufficiently large amount of available space such that
2984  * each thread in the system could have passed this allocation check and
2985  * still have sufficient free space.  With 20% of a minimum journal size
2986  * of 1MB we have 6553 records available.
2987  */
2988 int
2989 softdep_prealloc(vp, waitok)
2990         struct vnode *vp;
2991         int waitok;
2992 {
2993         struct ufsmount *ump;
2994
2995         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
2996             ("softdep_prealloc called on non-softdep filesystem"));
2997         /*
2998          * Nothing to do if we are not running journaled soft updates.
2999          * If we currently hold the snapshot lock, we must avoid
3000          * handling other resources that could cause deadlock.  Do not
3001          * touch quotas vnode since it is typically recursed with
3002          * other vnode locks held.
3003          */
3004         if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3005             (vp->v_vflag & VV_SYSTEM) != 0)
3006                 return (0);
3007         ump = VFSTOUFS(vp->v_mount);
3008         ACQUIRE_LOCK(ump);
3009         if (journal_space(ump, 0)) {
3010                 FREE_LOCK(ump);
3011                 return (0);
3012         }
3013         stat_journal_low++;
3014         FREE_LOCK(ump);
3015         if (waitok == MNT_NOWAIT)
3016                 return (ENOSPC);
3017         /*
3018          * Attempt to sync this vnode once to flush any journal
3019          * work attached to it.
3020          */
3021         if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3022                 ffs_syncvnode(vp, waitok, 0);
3023         ACQUIRE_LOCK(ump);
3024         process_removes(vp);
3025         process_truncates(vp);
3026         if (journal_space(ump, 0) == 0) {
3027                 softdep_speedup(ump);
3028                 if (journal_space(ump, 1) == 0)
3029                         journal_suspend(ump);
3030         }
3031         FREE_LOCK(ump);
3032
3033         return (0);
3034 }
3035
3036 /*
3037  * Before adjusting a link count on a vnode verify that we have sufficient
3038  * journal space.  If not, process operations that depend on the currently
3039  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3040  * and softdep flush threads can not acquire these locks to reclaim space.
3041  */
3042 static void
3043 softdep_prelink(dvp, vp)
3044         struct vnode *dvp;
3045         struct vnode *vp;
3046 {
3047         struct ufsmount *ump;
3048
3049         ump = VFSTOUFS(dvp->v_mount);
3050         LOCK_OWNED(ump);
3051         /*
3052          * Nothing to do if we have sufficient journal space.
3053          * If we currently hold the snapshot lock, we must avoid
3054          * handling other resources that could cause deadlock.
3055          */
3056         if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3057                 return;
3058         stat_journal_low++;
3059         FREE_LOCK(ump);
3060         if (vp)
3061                 ffs_syncvnode(vp, MNT_NOWAIT, 0);
3062         ffs_syncvnode(dvp, MNT_WAIT, 0);
3063         ACQUIRE_LOCK(ump);
3064         /* Process vp before dvp as it may create .. removes. */
3065         if (vp) {
3066                 process_removes(vp);
3067                 process_truncates(vp);
3068         }
3069         process_removes(dvp);
3070         process_truncates(dvp);
3071         softdep_speedup(ump);
3072         process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3073         if (journal_space(ump, 0) == 0) {
3074                 softdep_speedup(ump);
3075                 if (journal_space(ump, 1) == 0)
3076                         journal_suspend(ump);
3077         }
3078 }
3079
3080 static void
3081 jseg_write(ump, jseg, data)
3082         struct ufsmount *ump;
3083         struct jseg *jseg;
3084         uint8_t *data;
3085 {
3086         struct jsegrec *rec;
3087
3088         rec = (struct jsegrec *)data;
3089         rec->jsr_seq = jseg->js_seq;
3090         rec->jsr_oldest = jseg->js_oldseq;
3091         rec->jsr_cnt = jseg->js_cnt;
3092         rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3093         rec->jsr_crc = 0;
3094         rec->jsr_time = ump->um_fs->fs_mtime;
3095 }
3096
3097 static inline void
3098 inoref_write(inoref, jseg, rec)
3099         struct inoref *inoref;
3100         struct jseg *jseg;
3101         struct jrefrec *rec;
3102 {
3103
3104         inoref->if_jsegdep->jd_seg = jseg;
3105         rec->jr_ino = inoref->if_ino;
3106         rec->jr_parent = inoref->if_parent;
3107         rec->jr_nlink = inoref->if_nlink;
3108         rec->jr_mode = inoref->if_mode;
3109         rec->jr_diroff = inoref->if_diroff;
3110 }
3111
3112 static void
3113 jaddref_write(jaddref, jseg, data)
3114         struct jaddref *jaddref;
3115         struct jseg *jseg;
3116         uint8_t *data;
3117 {
3118         struct jrefrec *rec;
3119
3120         rec = (struct jrefrec *)data;
3121         rec->jr_op = JOP_ADDREF;
3122         inoref_write(&jaddref->ja_ref, jseg, rec);
3123 }
3124
3125 static void
3126 jremref_write(jremref, jseg, data)
3127         struct jremref *jremref;
3128         struct jseg *jseg;
3129         uint8_t *data;
3130 {
3131         struct jrefrec *rec;
3132
3133         rec = (struct jrefrec *)data;
3134         rec->jr_op = JOP_REMREF;
3135         inoref_write(&jremref->jr_ref, jseg, rec);
3136 }
3137
3138 static void
3139 jmvref_write(jmvref, jseg, data)
3140         struct jmvref *jmvref;
3141         struct jseg *jseg;
3142         uint8_t *data;
3143 {
3144         struct jmvrec *rec;
3145
3146         rec = (struct jmvrec *)data;
3147         rec->jm_op = JOP_MVREF;
3148         rec->jm_ino = jmvref->jm_ino;
3149         rec->jm_parent = jmvref->jm_parent;
3150         rec->jm_oldoff = jmvref->jm_oldoff;
3151         rec->jm_newoff = jmvref->jm_newoff;
3152 }
3153
3154 static void
3155 jnewblk_write(jnewblk, jseg, data)
3156         struct jnewblk *jnewblk;
3157         struct jseg *jseg;
3158         uint8_t *data;
3159 {
3160         struct jblkrec *rec;
3161
3162         jnewblk->jn_jsegdep->jd_seg = jseg;
3163         rec = (struct jblkrec *)data;
3164         rec->jb_op = JOP_NEWBLK;
3165         rec->jb_ino = jnewblk->jn_ino;
3166         rec->jb_blkno = jnewblk->jn_blkno;
3167         rec->jb_lbn = jnewblk->jn_lbn;
3168         rec->jb_frags = jnewblk->jn_frags;
3169         rec->jb_oldfrags = jnewblk->jn_oldfrags;
3170 }
3171
3172 static void
3173 jfreeblk_write(jfreeblk, jseg, data)
3174         struct jfreeblk *jfreeblk;
3175         struct jseg *jseg;
3176         uint8_t *data;
3177 {
3178         struct jblkrec *rec;
3179
3180         jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3181         rec = (struct jblkrec *)data;
3182         rec->jb_op = JOP_FREEBLK;
3183         rec->jb_ino = jfreeblk->jf_ino;
3184         rec->jb_blkno = jfreeblk->jf_blkno;
3185         rec->jb_lbn = jfreeblk->jf_lbn;
3186         rec->jb_frags = jfreeblk->jf_frags;
3187         rec->jb_oldfrags = 0;
3188 }
3189
3190 static void
3191 jfreefrag_write(jfreefrag, jseg, data)
3192         struct jfreefrag *jfreefrag;
3193         struct jseg *jseg;
3194         uint8_t *data;
3195 {
3196         struct jblkrec *rec;
3197
3198         jfreefrag->fr_jsegdep->jd_seg = jseg;
3199         rec = (struct jblkrec *)data;
3200         rec->jb_op = JOP_FREEBLK;
3201         rec->jb_ino = jfreefrag->fr_ino;
3202         rec->jb_blkno = jfreefrag->fr_blkno;
3203         rec->jb_lbn = jfreefrag->fr_lbn;
3204         rec->jb_frags = jfreefrag->fr_frags;
3205         rec->jb_oldfrags = 0;
3206 }
3207
3208 static void
3209 jtrunc_write(jtrunc, jseg, data)
3210         struct jtrunc *jtrunc;
3211         struct jseg *jseg;
3212         uint8_t *data;
3213 {
3214         struct jtrncrec *rec;
3215
3216         jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3217         rec = (struct jtrncrec *)data;
3218         rec->jt_op = JOP_TRUNC;
3219         rec->jt_ino = jtrunc->jt_ino;
3220         rec->jt_size = jtrunc->jt_size;
3221         rec->jt_extsize = jtrunc->jt_extsize;
3222 }
3223
3224 static void
3225 jfsync_write(jfsync, jseg, data)
3226         struct jfsync *jfsync;
3227         struct jseg *jseg;
3228         uint8_t *data;
3229 {
3230         struct jtrncrec *rec;
3231
3232         rec = (struct jtrncrec *)data;
3233         rec->jt_op = JOP_SYNC;
3234         rec->jt_ino = jfsync->jfs_ino;
3235         rec->jt_size = jfsync->jfs_size;
3236         rec->jt_extsize = jfsync->jfs_extsize;
3237 }
3238
3239 static void
3240 softdep_flushjournal(mp)
3241         struct mount *mp;
3242 {
3243         struct jblocks *jblocks;
3244         struct ufsmount *ump;
3245
3246         if (MOUNTEDSUJ(mp) == 0)
3247                 return;
3248         ump = VFSTOUFS(mp);
3249         jblocks = ump->softdep_jblocks;
3250         ACQUIRE_LOCK(ump);
3251         while (ump->softdep_on_journal) {
3252                 jblocks->jb_needseg = 1;
3253                 softdep_process_journal(mp, NULL, MNT_WAIT);
3254         }
3255         FREE_LOCK(ump);
3256 }
3257
3258 static void softdep_synchronize_completed(struct bio *);
3259 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3260
3261 static void
3262 softdep_synchronize_completed(bp)
3263         struct bio *bp;
3264 {
3265         struct jseg *oldest;
3266         struct jseg *jseg;
3267         struct ufsmount *ump;
3268
3269         /*
3270          * caller1 marks the last segment written before we issued the
3271          * synchronize cache.
3272          */
3273         jseg = bp->bio_caller1;
3274         if (jseg == NULL) {
3275                 g_destroy_bio(bp);
3276                 return;
3277         }
3278         ump = VFSTOUFS(jseg->js_list.wk_mp);
3279         ACQUIRE_LOCK(ump);
3280         oldest = NULL;
3281         /*
3282          * Mark all the journal entries waiting on the synchronize cache
3283          * as completed so they may continue on.
3284          */
3285         while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3286                 jseg->js_state |= COMPLETE;
3287                 oldest = jseg;
3288                 jseg = TAILQ_PREV(jseg, jseglst, js_next);
3289         }
3290         /*
3291          * Restart deferred journal entry processing from the oldest
3292          * completed jseg.
3293          */
3294         if (oldest)
3295                 complete_jsegs(oldest);
3296
3297         FREE_LOCK(ump);
3298         g_destroy_bio(bp);
3299 }
3300
3301 /*
3302  * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3303  * barriers.  The journal must be written prior to any blocks that depend
3304  * on it and the journal can not be released until the blocks have be
3305  * written.  This code handles both barriers simultaneously.
3306  */
3307 static void
3308 softdep_synchronize(bp, ump, caller1)
3309         struct bio *bp;
3310         struct ufsmount *ump;
3311         void *caller1;
3312 {
3313
3314         bp->bio_cmd = BIO_FLUSH;
3315         bp->bio_flags |= BIO_ORDERED;
3316         bp->bio_data = NULL;
3317         bp->bio_offset = ump->um_cp->provider->mediasize;
3318         bp->bio_length = 0;
3319         bp->bio_done = softdep_synchronize_completed;
3320         bp->bio_caller1 = caller1;
3321         g_io_request(bp,
3322             (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3323 }
3324
3325 /*
3326  * Flush some journal records to disk.
3327  */
3328 static void
3329 softdep_process_journal(mp, needwk, flags)
3330         struct mount *mp;
3331         struct worklist *needwk;
3332         int flags;
3333 {
3334         struct jblocks *jblocks;
3335         struct ufsmount *ump;
3336         struct worklist *wk;
3337         struct jseg *jseg;
3338         struct buf *bp;
3339         struct bio *bio;
3340         uint8_t *data;
3341         struct fs *fs;
3342         int shouldflush;
3343         int segwritten;
3344         int jrecmin;    /* Minimum records per block. */
3345         int jrecmax;    /* Maximum records per block. */
3346         int size;
3347         int cnt;
3348         int off;
3349         int devbsize;
3350
3351         if (MOUNTEDSUJ(mp) == 0)
3352                 return;
3353         shouldflush = softdep_flushcache;
3354         bio = NULL;
3355         jseg = NULL;
3356         ump = VFSTOUFS(mp);
3357         LOCK_OWNED(ump);
3358         fs = ump->um_fs;
3359         jblocks = ump->softdep_jblocks;
3360         devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3361         /*
3362          * We write anywhere between a disk block and fs block.  The upper
3363          * bound is picked to prevent buffer cache fragmentation and limit
3364          * processing time per I/O.
3365          */
3366         jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3367         jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3368         segwritten = 0;
3369         for (;;) {
3370                 cnt = ump->softdep_on_journal;
3371                 /*
3372                  * Criteria for writing a segment:
3373                  * 1) We have a full block.
3374                  * 2) We're called from jwait() and haven't found the
3375                  *    journal item yet.
3376                  * 3) Always write if needseg is set.
3377                  * 4) If we are called from process_worklist and have
3378                  *    not yet written anything we write a partial block
3379                  *    to enforce a 1 second maximum latency on journal
3380                  *    entries.
3381                  */
3382                 if (cnt < (jrecmax - 1) && needwk == NULL &&
3383                     jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3384                         break;
3385                 cnt++;
3386                 /*
3387                  * Verify some free journal space.  softdep_prealloc() should
3388                  * guarantee that we don't run out so this is indicative of
3389                  * a problem with the flow control.  Try to recover
3390                  * gracefully in any event.
3391                  */
3392                 while (jblocks->jb_free == 0) {
3393                         if (flags != MNT_WAIT)
3394                                 break;
3395                         printf("softdep: Out of journal space!\n");
3396                         softdep_speedup(ump);
3397                         msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3398                 }
3399                 FREE_LOCK(ump);
3400                 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3401                 workitem_alloc(&jseg->js_list, D_JSEG, mp);
3402                 LIST_INIT(&jseg->js_entries);
3403                 LIST_INIT(&jseg->js_indirs);
3404                 jseg->js_state = ATTACHED;
3405                 if (shouldflush == 0)
3406                         jseg->js_state |= COMPLETE;
3407                 else if (bio == NULL)
3408                         bio = g_alloc_bio();
3409                 jseg->js_jblocks = jblocks;
3410                 bp = geteblk(fs->fs_bsize, 0);
3411                 ACQUIRE_LOCK(ump);
3412                 /*
3413                  * If there was a race while we were allocating the block
3414                  * and jseg the entry we care about was likely written.
3415                  * We bail out in both the WAIT and NOWAIT case and assume
3416                  * the caller will loop if the entry it cares about is
3417                  * not written.
3418                  */
3419                 cnt = ump->softdep_on_journal;
3420                 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3421                         bp->b_flags |= B_INVAL | B_NOCACHE;
3422                         WORKITEM_FREE(jseg, D_JSEG);
3423                         FREE_LOCK(ump);
3424                         brelse(bp);
3425                         ACQUIRE_LOCK(ump);
3426                         break;
3427                 }
3428                 /*
3429                  * Calculate the disk block size required for the available
3430                  * records rounded to the min size.
3431                  */
3432                 if (cnt == 0)
3433                         size = devbsize;
3434                 else if (cnt < jrecmax)
3435                         size = howmany(cnt, jrecmin) * devbsize;
3436                 else
3437                         size = fs->fs_bsize;
3438                 /*
3439                  * Allocate a disk block for this journal data and account
3440                  * for truncation of the requested size if enough contiguous
3441                  * space was not available.
3442                  */
3443                 bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3444                 bp->b_lblkno = bp->b_blkno;
3445                 bp->b_offset = bp->b_blkno * DEV_BSIZE;
3446                 bp->b_bcount = size;
3447                 bp->b_flags &= ~B_INVAL;
3448                 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3449                 /*
3450                  * Initialize our jseg with cnt records.  Assign the next
3451                  * sequence number to it and link it in-order.
3452                  */
3453                 cnt = MIN(cnt, (size / devbsize) * jrecmin);
3454                 jseg->js_buf = bp;
3455                 jseg->js_cnt = cnt;
3456                 jseg->js_refs = cnt + 1;        /* Self ref. */
3457                 jseg->js_size = size;
3458                 jseg->js_seq = jblocks->jb_nextseq++;
3459                 if (jblocks->jb_oldestseg == NULL)
3460                         jblocks->jb_oldestseg = jseg;
3461                 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3462                 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3463                 if (jblocks->jb_writeseg == NULL)
3464                         jblocks->jb_writeseg = jseg;
3465                 /*
3466                  * Start filling in records from the pending list.
3467                  */
3468                 data = bp->b_data;
3469                 off = 0;
3470
3471                 /*
3472                  * Always put a header on the first block.
3473                  * XXX As with below, there might not be a chance to get
3474                  * into the loop.  Ensure that something valid is written.
3475                  */
3476                 jseg_write(ump, jseg, data);
3477                 off += JREC_SIZE;
3478                 data = bp->b_data + off;
3479
3480                 /*
3481                  * XXX Something is wrong here.  There's no work to do,
3482                  * but we need to perform and I/O and allow it to complete
3483                  * anyways.
3484                  */
3485                 if (LIST_EMPTY(&ump->softdep_journal_pending))
3486                         stat_emptyjblocks++;
3487
3488                 while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3489                     != NULL) {
3490                         if (cnt == 0)
3491                                 break;
3492                         /* Place a segment header on every device block. */
3493                         if ((off % devbsize) == 0) {
3494                                 jseg_write(ump, jseg, data);
3495                                 off += JREC_SIZE;
3496                                 data = bp->b_data + off;
3497                         }
3498                         if (wk == needwk)
3499                                 needwk = NULL;
3500                         remove_from_journal(wk);
3501                         wk->wk_state |= INPROGRESS;
3502                         WORKLIST_INSERT(&jseg->js_entries, wk);
3503                         switch (wk->wk_type) {
3504                         case D_JADDREF:
3505                                 jaddref_write(WK_JADDREF(wk), jseg, data);
3506                                 break;
3507                         case D_JREMREF:
3508                                 jremref_write(WK_JREMREF(wk), jseg, data);
3509                                 break;
3510                         case D_JMVREF:
3511                                 jmvref_write(WK_JMVREF(wk), jseg, data);
3512                                 break;
3513                         case D_JNEWBLK:
3514                                 jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3515                                 break;
3516                         case D_JFREEBLK:
3517                                 jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3518                                 break;
3519                         case D_JFREEFRAG:
3520                                 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3521                                 break;
3522                         case D_JTRUNC:
3523                                 jtrunc_write(WK_JTRUNC(wk), jseg, data);
3524                                 break;
3525                         case D_JFSYNC:
3526                                 jfsync_write(WK_JFSYNC(wk), jseg, data);
3527                                 break;
3528                         default:
3529                                 panic("process_journal: Unknown type %s",
3530                                     TYPENAME(wk->wk_type));
3531                                 /* NOTREACHED */
3532                         }
3533                         off += JREC_SIZE;
3534                         data = bp->b_data + off;
3535                         cnt--;
3536                 }
3537
3538                 /* Clear any remaining space so we don't leak kernel data */
3539                 if (size > off)
3540                         bzero(data, size - off);
3541
3542                 /*
3543                  * Write this one buffer and continue.
3544                  */
3545                 segwritten = 1;
3546                 jblocks->jb_needseg = 0;
3547                 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3548                 FREE_LOCK(ump);
3549                 pbgetvp(ump->um_devvp, bp);
3550                 /*
3551                  * We only do the blocking wait once we find the journal
3552                  * entry we're looking for.
3553                  */
3554                 if (needwk == NULL && flags == MNT_WAIT)
3555                         bwrite(bp);
3556                 else
3557                         bawrite(bp);
3558                 ACQUIRE_LOCK(ump);
3559         }
3560         /*
3561          * If we wrote a segment issue a synchronize cache so the journal
3562          * is reflected on disk before the data is written.  Since reclaiming
3563          * journal space also requires writing a journal record this
3564          * process also enforces a barrier before reclamation.
3565          */
3566         if (segwritten && shouldflush) {
3567                 softdep_synchronize(bio, ump,
3568                     TAILQ_LAST(&jblocks->jb_segs, jseglst));
3569         } else if (bio)
3570                 g_destroy_bio(bio);
3571         /*
3572          * If we've suspended the filesystem because we ran out of journal
3573          * space either try to sync it here to make some progress or
3574          * unsuspend it if we already have.
3575          */
3576         if (flags == 0 && jblocks->jb_suspended) {
3577                 if (journal_unsuspend(ump))
3578                         return;
3579                 FREE_LOCK(ump);
3580                 VFS_SYNC(mp, MNT_NOWAIT);
3581                 ffs_sbupdate(ump, MNT_WAIT, 0);
3582                 ACQUIRE_LOCK(ump);
3583         }
3584 }
3585
3586 /*
3587  * Complete a jseg, allowing all dependencies awaiting journal writes
3588  * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3589  * structures so that the journal segment can be freed to reclaim space.
3590  */
3591 static void
3592 complete_jseg(jseg)
3593         struct jseg *jseg;
3594 {
3595         struct worklist *wk;
3596         struct jmvref *jmvref;
3597 #ifdef INVARIANTS
3598         int i = 0;
3599 #endif
3600
3601         while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3602                 WORKLIST_REMOVE(wk);
3603                 wk->wk_state &= ~INPROGRESS;
3604                 wk->wk_state |= COMPLETE;
3605                 KASSERT(i++ < jseg->js_cnt,
3606                     ("handle_written_jseg: overflow %d >= %d",
3607                     i - 1, jseg->js_cnt));
3608                 switch (wk->wk_type) {
3609                 case D_JADDREF:
3610                         handle_written_jaddref(WK_JADDREF(wk));
3611                         break;
3612                 case D_JREMREF:
3613                         handle_written_jremref(WK_JREMREF(wk));
3614                         break;
3615                 case D_JMVREF:
3616                         rele_jseg(jseg);        /* No jsegdep. */
3617                         jmvref = WK_JMVREF(wk);
3618                         LIST_REMOVE(jmvref, jm_deps);
3619                         if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3620                                 free_pagedep(jmvref->jm_pagedep);
3621                         WORKITEM_FREE(jmvref, D_JMVREF);
3622                         break;
3623                 case D_JNEWBLK:
3624                         handle_written_jnewblk(WK_JNEWBLK(wk));
3625                         break;
3626                 case D_JFREEBLK:
3627                         handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3628                         break;
3629                 case D_JTRUNC:
3630                         handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3631                         break;
3632                 case D_JFSYNC:
3633                         rele_jseg(jseg);        /* No jsegdep. */
3634                         WORKITEM_FREE(wk, D_JFSYNC);
3635                         break;
3636                 case D_JFREEFRAG:
3637                         handle_written_jfreefrag(WK_JFREEFRAG(wk));
3638                         break;
3639                 default:
3640                         panic("handle_written_jseg: Unknown type %s",
3641                             TYPENAME(wk->wk_type));
3642                         /* NOTREACHED */
3643                 }
3644         }
3645         /* Release the self reference so the structure may be freed. */
3646         rele_jseg(jseg);
3647 }
3648
3649 /*
3650  * Determine which jsegs are ready for completion processing.  Waits for
3651  * synchronize cache to complete as well as forcing in-order completion
3652  * of journal entries.
3653  */
3654 static void
3655 complete_jsegs(jseg)
3656         struct jseg *jseg;
3657 {
3658         struct jblocks *jblocks;
3659         struct jseg *jsegn;
3660
3661         jblocks = jseg->js_jblocks;
3662         /*
3663          * Don't allow out of order completions.  If this isn't the first
3664          * block wait for it to write before we're done.
3665          */
3666         if (jseg != jblocks->jb_writeseg)
3667                 return;
3668         /* Iterate through available jsegs processing their entries. */
3669         while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3670                 jblocks->jb_oldestwrseq = jseg->js_oldseq;
3671                 jsegn = TAILQ_NEXT(jseg, js_next);
3672                 complete_jseg(jseg);
3673                 jseg = jsegn;
3674         }
3675         jblocks->jb_writeseg = jseg;
3676         /*
3677          * Attempt to free jsegs now that oldestwrseq may have advanced.
3678          */
3679         free_jsegs(jblocks);
3680 }
3681
3682 /*
3683  * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3684  * the final completions.
3685  */
3686 static void
3687 handle_written_jseg(jseg, bp)
3688         struct jseg *jseg;
3689         struct buf *bp;
3690 {
3691
3692         if (jseg->js_refs == 0)
3693                 panic("handle_written_jseg: No self-reference on %p", jseg);
3694         jseg->js_state |= DEPCOMPLETE;
3695         /*
3696          * We'll never need this buffer again, set flags so it will be
3697          * discarded.
3698          */
3699         bp->b_flags |= B_INVAL | B_NOCACHE;
3700         pbrelvp(bp);
3701         complete_jsegs(jseg);
3702 }
3703
3704 static inline struct jsegdep *
3705 inoref_jseg(inoref)
3706         struct inoref *inoref;
3707 {
3708         struct jsegdep *jsegdep;
3709
3710         jsegdep = inoref->if_jsegdep;
3711         inoref->if_jsegdep = NULL;
3712
3713         return (jsegdep);
3714 }
3715
3716 /*
3717  * Called once a jremref has made it to stable store.  The jremref is marked
3718  * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3719  * for the jremref to complete will be awoken by free_jremref.
3720  */
3721 static void
3722 handle_written_jremref(jremref)
3723         struct jremref *jremref;
3724 {
3725         struct inodedep *inodedep;
3726         struct jsegdep *jsegdep;
3727         struct dirrem *dirrem;
3728
3729         /* Grab the jsegdep. */
3730         jsegdep = inoref_jseg(&jremref->jr_ref);
3731         /*
3732          * Remove us from the inoref list.
3733          */
3734         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3735             0, &inodedep) == 0)
3736                 panic("handle_written_jremref: Lost inodedep");
3737         TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3738         /*
3739          * Complete the dirrem.
3740          */
3741         dirrem = jremref->jr_dirrem;
3742         jremref->jr_dirrem = NULL;
3743         LIST_REMOVE(jremref, jr_deps);
3744         jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3745         jwork_insert(&dirrem->dm_jwork, jsegdep);
3746         if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3747             (dirrem->dm_state & COMPLETE) != 0)
3748                 add_to_worklist(&dirrem->dm_list, 0);
3749         free_jremref(jremref);
3750 }
3751
3752 /*
3753  * Called once a jaddref has made it to stable store.  The dependency is
3754  * marked complete and any dependent structures are added to the inode
3755  * bufwait list to be completed as soon as it is written.  If a bitmap write
3756  * depends on this entry we move the inode into the inodedephd of the
3757  * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3758  */
3759 static void
3760 handle_written_jaddref(jaddref)
3761         struct jaddref *jaddref;
3762 {
3763         struct jsegdep *jsegdep;
3764         struct inodedep *inodedep;
3765         struct diradd *diradd;
3766         struct mkdir *mkdir;
3767
3768         /* Grab the jsegdep. */
3769         jsegdep = inoref_jseg(&jaddref->ja_ref);
3770         mkdir = NULL;
3771         diradd = NULL;
3772         if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3773             0, &inodedep) == 0)
3774                 panic("handle_written_jaddref: Lost inodedep.");
3775         if (jaddref->ja_diradd == NULL)
3776                 panic("handle_written_jaddref: No dependency");
3777         if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3778                 diradd = jaddref->ja_diradd;
3779                 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3780         } else if (jaddref->ja_state & MKDIR_PARENT) {
3781                 mkdir = jaddref->ja_mkdir;
3782                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3783         } else if (jaddref->ja_state & MKDIR_BODY)
3784                 mkdir = jaddref->ja_mkdir;
3785         else
3786                 panic("handle_written_jaddref: Unknown dependency %p",
3787                     jaddref->ja_diradd);
3788         jaddref->ja_diradd = NULL;      /* also clears ja_mkdir */
3789         /*
3790          * Remove us from the inode list.
3791          */
3792         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3793         /*
3794          * The mkdir may be waiting on the jaddref to clear before freeing.
3795          */
3796         if (mkdir) {
3797                 KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3798                     ("handle_written_jaddref: Incorrect type for mkdir %s",
3799                     TYPENAME(mkdir->md_list.wk_type)));
3800                 mkdir->md_jaddref = NULL;
3801                 diradd = mkdir->md_diradd;
3802                 mkdir->md_state |= DEPCOMPLETE;
3803                 complete_mkdir(mkdir);
3804         }
3805         jwork_insert(&diradd->da_jwork, jsegdep);
3806         if (jaddref->ja_state & NEWBLOCK) {
3807                 inodedep->id_state |= ONDEPLIST;
3808                 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3809                     inodedep, id_deps);
3810         }
3811         free_jaddref(jaddref);
3812 }
3813
3814 /*
3815  * Called once a jnewblk journal is written.  The allocdirect or allocindir
3816  * is placed in the bmsafemap to await notification of a written bitmap.  If
3817  * the operation was canceled we add the segdep to the appropriate
3818  * dependency to free the journal space once the canceling operation
3819  * completes.
3820  */
3821 static void
3822 handle_written_jnewblk(jnewblk)
3823         struct jnewblk *jnewblk;
3824 {
3825         struct bmsafemap *bmsafemap;
3826         struct freefrag *freefrag;
3827         struct freework *freework;
3828         struct jsegdep *jsegdep;
3829         struct newblk *newblk;
3830
3831         /* Grab the jsegdep. */
3832         jsegdep = jnewblk->jn_jsegdep;
3833         jnewblk->jn_jsegdep = NULL;
3834         if (jnewblk->jn_dep == NULL)
3835                 panic("handle_written_jnewblk: No dependency for the segdep.");
3836         switch (jnewblk->jn_dep->wk_type) {
3837         case D_NEWBLK:
3838         case D_ALLOCDIRECT:
3839         case D_ALLOCINDIR:
3840                 /*
3841                  * Add the written block to the bmsafemap so it can
3842                  * be notified when the bitmap is on disk.
3843                  */
3844                 newblk = WK_NEWBLK(jnewblk->jn_dep);
3845                 newblk->nb_jnewblk = NULL;
3846                 if ((newblk->nb_state & GOINGAWAY) == 0) {
3847                         bmsafemap = newblk->nb_bmsafemap;
3848                         newblk->nb_state |= ONDEPLIST;
3849                         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3850                             nb_deps);
3851                 }
3852                 jwork_insert(&newblk->nb_jwork, jsegdep);
3853                 break;
3854         case D_FREEFRAG:
3855                 /*
3856                  * A newblock being removed by a freefrag when replaced by
3857                  * frag extension.
3858                  */
3859                 freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3860                 freefrag->ff_jdep = NULL;
3861                 jwork_insert(&freefrag->ff_jwork, jsegdep);
3862                 break;
3863         case D_FREEWORK:
3864                 /*
3865                  * A direct block was removed by truncate.
3866                  */
3867                 freework = WK_FREEWORK(jnewblk->jn_dep);
3868                 freework->fw_jnewblk = NULL;
3869                 jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3870                 break;
3871         default:
3872                 panic("handle_written_jnewblk: Unknown type %d.",
3873                     jnewblk->jn_dep->wk_type);
3874         }
3875         jnewblk->jn_dep = NULL;
3876         free_jnewblk(jnewblk);
3877 }
3878
3879 /*
3880  * Cancel a jfreefrag that won't be needed, probably due to colliding with
3881  * an in-flight allocation that has not yet been committed.  Divorce us
3882  * from the freefrag and mark it DEPCOMPLETE so that it may be added
3883  * to the worklist.
3884  */
3885 static void
3886 cancel_jfreefrag(jfreefrag)
3887         struct jfreefrag *jfreefrag;
3888 {
3889         struct freefrag *freefrag;
3890
3891         if (jfreefrag->fr_jsegdep) {
3892                 free_jsegdep(jfreefrag->fr_jsegdep);
3893                 jfreefrag->fr_jsegdep = NULL;
3894         }
3895         freefrag = jfreefrag->fr_freefrag;
3896         jfreefrag->fr_freefrag = NULL;
3897         free_jfreefrag(jfreefrag);
3898         freefrag->ff_state |= DEPCOMPLETE;
3899         CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3900 }
3901
3902 /*
3903  * Free a jfreefrag when the parent freefrag is rendered obsolete.
3904  */
3905 static void
3906 free_jfreefrag(jfreefrag)
3907         struct jfreefrag *jfreefrag;
3908 {
3909
3910         if (jfreefrag->fr_state & INPROGRESS)
3911                 WORKLIST_REMOVE(&jfreefrag->fr_list);
3912         else if (jfreefrag->fr_state & ONWORKLIST)
3913                 remove_from_journal(&jfreefrag->fr_list);
3914         if (jfreefrag->fr_freefrag != NULL)
3915                 panic("free_jfreefrag:  Still attached to a freefrag.");
3916         WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3917 }
3918
3919 /*
3920  * Called when the journal write for a jfreefrag completes.  The parent
3921  * freefrag is added to the worklist if this completes its dependencies.
3922  */
3923 static void
3924 handle_written_jfreefrag(jfreefrag)
3925         struct jfreefrag *jfreefrag;
3926 {
3927         struct jsegdep *jsegdep;
3928         struct freefrag *freefrag;
3929
3930         /* Grab the jsegdep. */
3931         jsegdep = jfreefrag->fr_jsegdep;
3932         jfreefrag->fr_jsegdep = NULL;
3933         freefrag = jfreefrag->fr_freefrag;
3934         if (freefrag == NULL)
3935                 panic("handle_written_jfreefrag: No freefrag.");
3936         freefrag->ff_state |= DEPCOMPLETE;
3937         freefrag->ff_jdep = NULL;
3938         jwork_insert(&freefrag->ff_jwork, jsegdep);
3939         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3940                 add_to_worklist(&freefrag->ff_list, 0);
3941         jfreefrag->fr_freefrag = NULL;
3942         free_jfreefrag(jfreefrag);
3943 }
3944
3945 /*
3946  * Called when the journal write for a jfreeblk completes.  The jfreeblk
3947  * is removed from the freeblks list of pending journal writes and the
3948  * jsegdep is moved to the freeblks jwork to be completed when all blocks
3949  * have been reclaimed.
3950  */
3951 static void
3952 handle_written_jblkdep(jblkdep)
3953         struct jblkdep *jblkdep;
3954 {
3955         struct freeblks *freeblks;
3956         struct jsegdep *jsegdep;
3957
3958         /* Grab the jsegdep. */
3959         jsegdep = jblkdep->jb_jsegdep;
3960         jblkdep->jb_jsegdep = NULL;
3961         freeblks = jblkdep->jb_freeblks;
3962         LIST_REMOVE(jblkdep, jb_deps);
3963         jwork_insert(&freeblks->fb_jwork, jsegdep);
3964         /*
3965          * If the freeblks is all journaled, we can add it to the worklist.
3966          */
3967         if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3968             (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3969                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3970
3971         free_jblkdep(jblkdep);
3972 }
3973
3974 static struct jsegdep *
3975 newjsegdep(struct worklist *wk)
3976 {
3977         struct jsegdep *jsegdep;
3978
3979         jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3980         workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3981         jsegdep->jd_seg = NULL;
3982
3983         return (jsegdep);
3984 }
3985
3986 static struct jmvref *
3987 newjmvref(dp, ino, oldoff, newoff)
3988         struct inode *dp;
3989         ino_t ino;
3990         off_t oldoff;
3991         off_t newoff;
3992 {
3993         struct jmvref *jmvref;
3994
3995         jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3996         workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3997         jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3998         jmvref->jm_parent = dp->i_number;
3999         jmvref->jm_ino = ino;
4000         jmvref->jm_oldoff = oldoff;
4001         jmvref->jm_newoff = newoff;
4002
4003         return (jmvref);
4004 }
4005
4006 /*
4007  * Allocate a new jremref that tracks the removal of ip from dp with the
4008  * directory entry offset of diroff.  Mark the entry as ATTACHED and
4009  * DEPCOMPLETE as we have all the information required for the journal write
4010  * and the directory has already been removed from the buffer.  The caller
4011  * is responsible for linking the jremref into the pagedep and adding it
4012  * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4013  * a DOTDOT addition so handle_workitem_remove() can properly assign
4014  * the jsegdep when we're done.
4015  */
4016 static struct jremref *
4017 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4018     off_t diroff, nlink_t nlink)
4019 {
4020         struct jremref *jremref;
4021
4022         jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4023         workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
4024         jremref->jr_state = ATTACHED;
4025         newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4026            nlink, ip->i_mode);
4027         jremref->jr_dirrem = dirrem;
4028
4029         return (jremref);
4030 }
4031
4032 static inline void
4033 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4034     nlink_t nlink, uint16_t mode)
4035 {
4036
4037         inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4038         inoref->if_diroff = diroff;
4039         inoref->if_ino = ino;
4040         inoref->if_parent = parent;
4041         inoref->if_nlink = nlink;
4042         inoref->if_mode = mode;
4043 }
4044
4045 /*
4046  * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4047  * directory offset may not be known until later.  The caller is responsible
4048  * adding the entry to the journal when this information is available.  nlink
4049  * should be the link count prior to the addition and mode is only required
4050  * to have the correct FMT.
4051  */
4052 static struct jaddref *
4053 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4054     uint16_t mode)
4055 {
4056         struct jaddref *jaddref;
4057
4058         jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4059         workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
4060         jaddref->ja_state = ATTACHED;
4061         jaddref->ja_mkdir = NULL;
4062         newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4063
4064         return (jaddref);
4065 }
4066
4067 /*
4068  * Create a new free dependency for a freework.  The caller is responsible
4069  * for adjusting the reference count when it has the lock held.  The freedep
4070  * will track an outstanding bitmap write that will ultimately clear the
4071  * freework to continue.
4072  */
4073 static struct freedep *
4074 newfreedep(struct freework *freework)
4075 {
4076         struct freedep *freedep;
4077
4078         freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4079         workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4080         freedep->fd_freework = freework;
4081
4082         return (freedep);
4083 }
4084
4085 /*
4086  * Free a freedep structure once the buffer it is linked to is written.  If
4087  * this is the last reference to the freework schedule it for completion.
4088  */
4089 static void
4090 free_freedep(freedep)
4091         struct freedep *freedep;
4092 {
4093         struct freework *freework;
4094
4095         freework = freedep->fd_freework;
4096         freework->fw_freeblks->fb_cgwait--;
4097         if (--freework->fw_ref == 0)
4098                 freework_enqueue(freework);
4099         WORKITEM_FREE(freedep, D_FREEDEP);
4100 }
4101
4102 /*
4103  * Allocate a new freework structure that may be a level in an indirect
4104  * when parent is not NULL or a top level block when it is.  The top level
4105  * freework structures are allocated without the per-filesystem lock held
4106  * and before the freeblks is visible outside of softdep_setup_freeblocks().
4107  */
4108 static struct freework *
4109 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4110         struct ufsmount *ump;
4111         struct freeblks *freeblks;
4112         struct freework *parent;
4113         ufs_lbn_t lbn;
4114         ufs2_daddr_t nb;
4115         int frags;
4116         int off;
4117         int journal;
4118 {
4119         struct freework *freework;
4120
4121         freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4122         workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4123         freework->fw_state = ATTACHED;
4124         freework->fw_jnewblk = NULL;
4125         freework->fw_freeblks = freeblks;
4126         freework->fw_parent = parent;
4127         freework->fw_lbn = lbn;
4128         freework->fw_blkno = nb;
4129         freework->fw_frags = frags;
4130         freework->fw_indir = NULL;
4131         freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
4132                 ? 0 : NINDIR(ump->um_fs) + 1;
4133         freework->fw_start = freework->fw_off = off;
4134         if (journal)
4135                 newjfreeblk(freeblks, lbn, nb, frags);
4136         if (parent == NULL) {
4137                 ACQUIRE_LOCK(ump);
4138                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4139                 freeblks->fb_ref++;
4140                 FREE_LOCK(ump);
4141         }
4142
4143         return (freework);
4144 }
4145
4146 /*
4147  * Eliminate a jfreeblk for a block that does not need journaling.
4148  */
4149 static void
4150 cancel_jfreeblk(freeblks, blkno)
4151         struct freeblks *freeblks;
4152         ufs2_daddr_t blkno;
4153 {
4154         struct jfreeblk *jfreeblk;
4155         struct jblkdep *jblkdep;
4156
4157         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4158                 if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4159                         continue;
4160                 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4161                 if (jfreeblk->jf_blkno == blkno)
4162                         break;
4163         }
4164         if (jblkdep == NULL)
4165                 return;
4166         CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4167         free_jsegdep(jblkdep->jb_jsegdep);
4168         LIST_REMOVE(jblkdep, jb_deps);
4169         WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4170 }
4171
4172 /*
4173  * Allocate a new jfreeblk to journal top level block pointer when truncating
4174  * a file.  The caller must add this to the worklist when the per-filesystem
4175  * lock is held.
4176  */
4177 static struct jfreeblk *
4178 newjfreeblk(freeblks, lbn, blkno, frags)
4179         struct freeblks *freeblks;
4180         ufs_lbn_t lbn;
4181         ufs2_daddr_t blkno;
4182         int frags;
4183 {
4184         struct jfreeblk *jfreeblk;
4185
4186         jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4187         workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4188             freeblks->fb_list.wk_mp);
4189         jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4190         jfreeblk->jf_dep.jb_freeblks = freeblks;
4191         jfreeblk->jf_ino = freeblks->fb_inum;
4192         jfreeblk->jf_lbn = lbn;
4193         jfreeblk->jf_blkno = blkno;
4194         jfreeblk->jf_frags = frags;
4195         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4196
4197         return (jfreeblk);
4198 }
4199
4200 /*
4201  * The journal is only prepared to handle full-size block numbers, so we
4202  * have to adjust the record to reflect the change to a full-size block.
4203  * For example, suppose we have a block made up of fragments 8-15 and
4204  * want to free its last two fragments. We are given a request that says:
4205  *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4206  * where frags are the number of fragments to free and oldfrags are the
4207  * number of fragments to keep. To block align it, we have to change it to
4208  * have a valid full-size blkno, so it becomes:
4209  *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4210  */
4211 static void
4212 adjust_newfreework(freeblks, frag_offset)
4213         struct freeblks *freeblks;
4214         int frag_offset;
4215 {
4216         struct jfreeblk *jfreeblk;
4217
4218         KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4219             LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4220             ("adjust_newfreework: Missing freeblks dependency"));
4221
4222         jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4223         jfreeblk->jf_blkno -= frag_offset;
4224         jfreeblk->jf_frags += frag_offset;
4225 }
4226
4227 /*
4228  * Allocate a new jtrunc to track a partial truncation.
4229  */
4230 static struct jtrunc *
4231 newjtrunc(freeblks, size, extsize)
4232         struct freeblks *freeblks;
4233         off_t size;
4234         int extsize;
4235 {
4236         struct jtrunc *jtrunc;
4237
4238         jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4239         workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4240             freeblks->fb_list.wk_mp);
4241         jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4242         jtrunc->jt_dep.jb_freeblks = freeblks;
4243         jtrunc->jt_ino = freeblks->fb_inum;
4244         jtrunc->jt_size = size;
4245         jtrunc->jt_extsize = extsize;
4246         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4247
4248         return (jtrunc);
4249 }
4250
4251 /*
4252  * If we're canceling a new bitmap we have to search for another ref
4253  * to move into the bmsafemap dep.  This might be better expressed
4254  * with another structure.
4255  */
4256 static void
4257 move_newblock_dep(jaddref, inodedep)
4258         struct jaddref *jaddref;
4259         struct inodedep *inodedep;
4260 {
4261         struct inoref *inoref;
4262         struct jaddref *jaddrefn;
4263
4264         jaddrefn = NULL;
4265         for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4266             inoref = TAILQ_NEXT(inoref, if_deps)) {
4267                 if ((jaddref->ja_state & NEWBLOCK) &&
4268                     inoref->if_list.wk_type == D_JADDREF) {
4269                         jaddrefn = (struct jaddref *)inoref;
4270                         break;
4271                 }
4272         }
4273         if (jaddrefn == NULL)
4274                 return;
4275         jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4276         jaddrefn->ja_state |= jaddref->ja_state &
4277             (ATTACHED | UNDONE | NEWBLOCK);
4278         jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4279         jaddref->ja_state |= ATTACHED;
4280         LIST_REMOVE(jaddref, ja_bmdeps);
4281         LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4282             ja_bmdeps);
4283 }
4284
4285 /*
4286  * Cancel a jaddref either before it has been written or while it is being
4287  * written.  This happens when a link is removed before the add reaches
4288  * the disk.  The jaddref dependency is kept linked into the bmsafemap
4289  * and inode to prevent the link count or bitmap from reaching the disk
4290  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4291  * required.
4292  *
4293  * Returns 1 if the canceled addref requires journaling of the remove and
4294  * 0 otherwise.
4295  */
4296 static int
4297 cancel_jaddref(jaddref, inodedep, wkhd)
4298         struct jaddref *jaddref;
4299         struct inodedep *inodedep;
4300         struct workhead *wkhd;
4301 {
4302         struct inoref *inoref;
4303         struct jsegdep *jsegdep;
4304         int needsj;
4305
4306         KASSERT((jaddref->ja_state & COMPLETE) == 0,
4307             ("cancel_jaddref: Canceling complete jaddref"));
4308         if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4309                 needsj = 1;
4310         else
4311                 needsj = 0;
4312         if (inodedep == NULL)
4313                 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4314                     0, &inodedep) == 0)
4315                         panic("cancel_jaddref: Lost inodedep");
4316         /*
4317          * We must adjust the nlink of any reference operation that follows
4318          * us so that it is consistent with the in-memory reference.  This
4319          * ensures that inode nlink rollbacks always have the correct link.
4320          */
4321         if (needsj == 0) {
4322                 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4323                     inoref = TAILQ_NEXT(inoref, if_deps)) {
4324                         if (inoref->if_state & GOINGAWAY)
4325                                 break;
4326                         inoref->if_nlink--;
4327                 }
4328         }
4329         jsegdep = inoref_jseg(&jaddref->ja_ref);
4330         if (jaddref->ja_state & NEWBLOCK)
4331                 move_newblock_dep(jaddref, inodedep);
4332         wake_worklist(&jaddref->ja_list);
4333         jaddref->ja_mkdir = NULL;
4334         if (jaddref->ja_state & INPROGRESS) {
4335                 jaddref->ja_state &= ~INPROGRESS;
4336                 WORKLIST_REMOVE(&jaddref->ja_list);
4337                 jwork_insert(wkhd, jsegdep);
4338         } else {
4339                 free_jsegdep(jsegdep);
4340                 if (jaddref->ja_state & DEPCOMPLETE)
4341                         remove_from_journal(&jaddref->ja_list);
4342         }
4343         jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4344         /*
4345          * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4346          * can arrange for them to be freed with the bitmap.  Otherwise we
4347          * no longer need this addref attached to the inoreflst and it
4348          * will incorrectly adjust nlink if we leave it.
4349          */
4350         if ((jaddref->ja_state & NEWBLOCK) == 0) {
4351                 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4352                     if_deps);
4353                 jaddref->ja_state |= COMPLETE;
4354                 free_jaddref(jaddref);
4355                 return (needsj);
4356         }
4357         /*
4358          * Leave the head of the list for jsegdeps for fast merging.
4359          */
4360         if (LIST_FIRST(wkhd) != NULL) {
4361                 jaddref->ja_state |= ONWORKLIST;
4362                 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4363         } else
4364                 WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4365
4366         return (needsj);
4367 }
4368
4369 /*
4370  * Attempt to free a jaddref structure when some work completes.  This
4371  * should only succeed once the entry is written and all dependencies have
4372  * been notified.
4373  */
4374 static void
4375 free_jaddref(jaddref)
4376         struct jaddref *jaddref;
4377 {
4378
4379         if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4380                 return;
4381         if (jaddref->ja_ref.if_jsegdep)
4382                 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4383                     jaddref, jaddref->ja_state);
4384         if (jaddref->ja_state & NEWBLOCK)
4385                 LIST_REMOVE(jaddref, ja_bmdeps);
4386         if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4387                 panic("free_jaddref: Bad state %p(0x%X)",
4388                     jaddref, jaddref->ja_state);
4389         if (jaddref->ja_mkdir != NULL)
4390                 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4391         WORKITEM_FREE(jaddref, D_JADDREF);
4392 }
4393
4394 /*
4395  * Free a jremref structure once it has been written or discarded.
4396  */
4397 static void
4398 free_jremref(jremref)
4399         struct jremref *jremref;
4400 {
4401
4402         if (jremref->jr_ref.if_jsegdep)
4403                 free_jsegdep(jremref->jr_ref.if_jsegdep);
4404         if (jremref->jr_state & INPROGRESS)
4405                 panic("free_jremref: IO still pending");
4406         WORKITEM_FREE(jremref, D_JREMREF);
4407 }
4408
4409 /*
4410  * Free a jnewblk structure.
4411  */
4412 static void
4413 free_jnewblk(jnewblk)
4414         struct jnewblk *jnewblk;
4415 {
4416
4417         if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4418                 return;
4419         LIST_REMOVE(jnewblk, jn_deps);
4420         if (jnewblk->jn_dep != NULL)
4421                 panic("free_jnewblk: Dependency still attached.");
4422         WORKITEM_FREE(jnewblk, D_JNEWBLK);
4423 }
4424
4425 /*
4426  * Cancel a jnewblk which has been been made redundant by frag extension.
4427  */
4428 static void
4429 cancel_jnewblk(jnewblk, wkhd)
4430         struct jnewblk *jnewblk;
4431         struct workhead *wkhd;
4432 {
4433         struct jsegdep *jsegdep;
4434
4435         CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4436         jsegdep = jnewblk->jn_jsegdep;
4437         if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4438                 panic("cancel_jnewblk: Invalid state");
4439         jnewblk->jn_jsegdep  = NULL;
4440         jnewblk->jn_dep = NULL;
4441         jnewblk->jn_state |= GOINGAWAY;
4442         if (jnewblk->jn_state & INPROGRESS) {
4443                 jnewblk->jn_state &= ~INPROGRESS;
4444                 WORKLIST_REMOVE(&jnewblk->jn_list);
4445                 jwork_insert(wkhd, jsegdep);
4446         } else {
4447                 free_jsegdep(jsegdep);
4448                 remove_from_journal(&jnewblk->jn_list);
4449         }
4450         wake_worklist(&jnewblk->jn_list);
4451         WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4452 }
4453
4454 static void
4455 free_jblkdep(jblkdep)
4456         struct jblkdep *jblkdep;
4457 {
4458
4459         if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4460                 WORKITEM_FREE(jblkdep, D_JFREEBLK);
4461         else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4462                 WORKITEM_FREE(jblkdep, D_JTRUNC);
4463         else
4464                 panic("free_jblkdep: Unexpected type %s",
4465                     TYPENAME(jblkdep->jb_list.wk_type));
4466 }
4467
4468 /*
4469  * Free a single jseg once it is no longer referenced in memory or on
4470  * disk.  Reclaim journal blocks and dependencies waiting for the segment
4471  * to disappear.
4472  */
4473 static void
4474 free_jseg(jseg, jblocks)
4475         struct jseg *jseg;
4476         struct jblocks *jblocks;
4477 {
4478         struct freework *freework;
4479
4480         /*
4481          * Free freework structures that were lingering to indicate freed
4482          * indirect blocks that forced journal write ordering on reallocate.
4483          */
4484         while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4485                 indirblk_remove(freework);
4486         if (jblocks->jb_oldestseg == jseg)
4487                 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4488         TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4489         jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4490         KASSERT(LIST_EMPTY(&jseg->js_entries),
4491             ("free_jseg: Freed jseg has valid entries."));
4492         WORKITEM_FREE(jseg, D_JSEG);
4493 }
4494
4495 /*
4496  * Free all jsegs that meet the criteria for being reclaimed and update
4497  * oldestseg.
4498  */
4499 static void
4500 free_jsegs(jblocks)
4501         struct jblocks *jblocks;
4502 {
4503         struct jseg *jseg;
4504
4505         /*
4506          * Free only those jsegs which have none allocated before them to
4507          * preserve the journal space ordering.
4508          */
4509         while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4510                 /*
4511                  * Only reclaim space when nothing depends on this journal
4512                  * set and another set has written that it is no longer
4513                  * valid.
4514                  */
4515                 if (jseg->js_refs != 0) {
4516                         jblocks->jb_oldestseg = jseg;
4517                         return;
4518                 }
4519                 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4520                         break;
4521                 if (jseg->js_seq > jblocks->jb_oldestwrseq)
4522                         break;
4523                 /*
4524                  * We can free jsegs that didn't write entries when
4525                  * oldestwrseq == js_seq.
4526                  */
4527                 if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4528                     jseg->js_cnt != 0)
4529                         break;
4530                 free_jseg(jseg, jblocks);
4531         }
4532         /*
4533          * If we exited the loop above we still must discover the
4534          * oldest valid segment.
4535          */
4536         if (jseg)
4537                 for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4538                      jseg = TAILQ_NEXT(jseg, js_next))
4539                         if (jseg->js_refs != 0)
4540                                 break;
4541         jblocks->jb_oldestseg = jseg;
4542         /*
4543          * The journal has no valid records but some jsegs may still be
4544          * waiting on oldestwrseq to advance.  We force a small record
4545          * out to permit these lingering records to be reclaimed.
4546          */
4547         if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4548                 jblocks->jb_needseg = 1;
4549 }
4550
4551 /*
4552  * Release one reference to a jseg and free it if the count reaches 0.  This
4553  * should eventually reclaim journal space as well.
4554  */
4555 static void
4556 rele_jseg(jseg)
4557         struct jseg *jseg;
4558 {
4559
4560         KASSERT(jseg->js_refs > 0,
4561             ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4562         if (--jseg->js_refs != 0)
4563                 return;
4564         free_jsegs(jseg->js_jblocks);
4565 }
4566
4567 /*
4568  * Release a jsegdep and decrement the jseg count.
4569  */
4570 static void
4571 free_jsegdep(jsegdep)
4572         struct jsegdep *jsegdep;
4573 {
4574
4575         if (jsegdep->jd_seg)
4576                 rele_jseg(jsegdep->jd_seg);
4577         WORKITEM_FREE(jsegdep, D_JSEGDEP);
4578 }
4579
4580 /*
4581  * Wait for a journal item to make it to disk.  Initiate journal processing
4582  * if required.
4583  */
4584 static int
4585 jwait(wk, waitfor)
4586         struct worklist *wk;
4587         int waitfor;
4588 {
4589
4590         LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4591         /*
4592          * Blocking journal waits cause slow synchronous behavior.  Record
4593          * stats on the frequency of these blocking operations.
4594          */
4595         if (waitfor == MNT_WAIT) {
4596                 stat_journal_wait++;
4597                 switch (wk->wk_type) {
4598                 case D_JREMREF:
4599                 case D_JMVREF:
4600                         stat_jwait_filepage++;
4601                         break;
4602                 case D_JTRUNC:
4603                 case D_JFREEBLK:
4604                         stat_jwait_freeblks++;
4605                         break;
4606                 case D_JNEWBLK:
4607                         stat_jwait_newblk++;
4608                         break;
4609                 case D_JADDREF:
4610                         stat_jwait_inode++;
4611                         break;
4612                 default:
4613                         break;
4614                 }
4615         }
4616         /*
4617          * If IO has not started we process the journal.  We can't mark the
4618          * worklist item as IOWAITING because we drop the lock while
4619          * processing the journal and the worklist entry may be freed after
4620          * this point.  The caller may call back in and re-issue the request.
4621          */
4622         if ((wk->wk_state & INPROGRESS) == 0) {
4623                 softdep_process_journal(wk->wk_mp, wk, waitfor);
4624                 if (waitfor != MNT_WAIT)
4625                         return (EBUSY);
4626                 return (0);
4627         }
4628         if (waitfor != MNT_WAIT)
4629                 return (EBUSY);
4630         wait_worklist(wk, "jwait");
4631         return (0);
4632 }
4633
4634 /*
4635  * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4636  * appropriate.  This is a convenience function to reduce duplicate code
4637  * for the setup and revert functions below.
4638  */
4639 static struct inodedep *
4640 inodedep_lookup_ip(ip)
4641         struct inode *ip;
4642 {
4643         struct inodedep *inodedep;
4644
4645         KASSERT(ip->i_nlink >= ip->i_effnlink,
4646             ("inodedep_lookup_ip: bad delta"));
4647         (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
4648             &inodedep);
4649         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4650         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4651
4652         return (inodedep);
4653 }
4654
4655 /*
4656  * Called prior to creating a new inode and linking it to a directory.  The
4657  * jaddref structure must already be allocated by softdep_setup_inomapdep
4658  * and it is discovered here so we can initialize the mode and update
4659  * nlinkdelta.
4660  */
4661 void
4662 softdep_setup_create(dp, ip)
4663         struct inode *dp;
4664         struct inode *ip;
4665 {
4666         struct inodedep *inodedep;
4667         struct jaddref *jaddref;
4668         struct vnode *dvp;
4669
4670         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4671             ("softdep_setup_create called on non-softdep filesystem"));
4672         KASSERT(ip->i_nlink == 1,
4673             ("softdep_setup_create: Invalid link count."));
4674         dvp = ITOV(dp);
4675         ACQUIRE_LOCK(dp->i_ump);
4676         inodedep = inodedep_lookup_ip(ip);
4677         if (DOINGSUJ(dvp)) {
4678                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4679                     inoreflst);
4680                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4681                     ("softdep_setup_create: No addref structure present."));
4682         }
4683         softdep_prelink(dvp, NULL);
4684         FREE_LOCK(dp->i_ump);
4685 }
4686
4687 /*
4688  * Create a jaddref structure to track the addition of a DOTDOT link when
4689  * we are reparenting an inode as part of a rename.  This jaddref will be
4690  * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4691  * non-journaling softdep.
4692  */
4693 void
4694 softdep_setup_dotdot_link(dp, ip)
4695         struct inode *dp;
4696         struct inode *ip;
4697 {
4698         struct inodedep *inodedep;
4699         struct jaddref *jaddref;
4700         struct vnode *dvp;
4701
4702         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4703             ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4704         dvp = ITOV(dp);
4705         jaddref = NULL;
4706         /*
4707          * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4708          * is used as a normal link would be.
4709          */
4710         if (DOINGSUJ(dvp))
4711                 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4712                     dp->i_effnlink - 1, dp->i_mode);
4713         ACQUIRE_LOCK(dp->i_ump);
4714         inodedep = inodedep_lookup_ip(dp);
4715         if (jaddref)
4716                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4717                     if_deps);
4718         softdep_prelink(dvp, ITOV(ip));
4719         FREE_LOCK(dp->i_ump);
4720 }
4721
4722 /*
4723  * Create a jaddref structure to track a new link to an inode.  The directory
4724  * offset is not known until softdep_setup_directory_add or
4725  * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4726  * softdep.
4727  */
4728 void
4729 softdep_setup_link(dp, ip)
4730         struct inode *dp;
4731         struct inode *ip;
4732 {
4733         struct inodedep *inodedep;
4734         struct jaddref *jaddref;
4735         struct vnode *dvp;
4736
4737         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4738             ("softdep_setup_link called on non-softdep filesystem"));
4739         dvp = ITOV(dp);
4740         jaddref = NULL;
4741         if (DOINGSUJ(dvp))
4742                 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4743                     ip->i_mode);
4744         ACQUIRE_LOCK(dp->i_ump);
4745         inodedep = inodedep_lookup_ip(ip);
4746         if (jaddref)
4747                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4748                     if_deps);
4749         softdep_prelink(dvp, ITOV(ip));
4750         FREE_LOCK(dp->i_ump);
4751 }
4752
4753 /*
4754  * Called to create the jaddref structures to track . and .. references as
4755  * well as lookup and further initialize the incomplete jaddref created
4756  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4757  * nlinkdelta for non-journaling softdep.
4758  */
4759 void
4760 softdep_setup_mkdir(dp, ip)
4761         struct inode *dp;
4762         struct inode *ip;
4763 {
4764         struct inodedep *inodedep;
4765         struct jaddref *dotdotaddref;
4766         struct jaddref *dotaddref;
4767         struct jaddref *jaddref;
4768         struct vnode *dvp;
4769
4770         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4771             ("softdep_setup_mkdir called on non-softdep filesystem"));
4772         dvp = ITOV(dp);
4773         dotaddref = dotdotaddref = NULL;
4774         if (DOINGSUJ(dvp)) {
4775                 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4776                     ip->i_mode);
4777                 dotaddref->ja_state |= MKDIR_BODY;
4778                 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4779                     dp->i_effnlink - 1, dp->i_mode);
4780                 dotdotaddref->ja_state |= MKDIR_PARENT;
4781         }
4782         ACQUIRE_LOCK(dp->i_ump);
4783         inodedep = inodedep_lookup_ip(ip);
4784         if (DOINGSUJ(dvp)) {
4785                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4786                     inoreflst);
4787                 KASSERT(jaddref != NULL,
4788                     ("softdep_setup_mkdir: No addref structure present."));
4789                 KASSERT(jaddref->ja_parent == dp->i_number,
4790                     ("softdep_setup_mkdir: bad parent %ju",
4791                     (uintmax_t)jaddref->ja_parent));
4792                 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4793                     if_deps);
4794         }
4795         inodedep = inodedep_lookup_ip(dp);
4796         if (DOINGSUJ(dvp))
4797                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4798                     &dotdotaddref->ja_ref, if_deps);
4799         softdep_prelink(ITOV(dp), NULL);
4800         FREE_LOCK(dp->i_ump);
4801 }
4802
4803 /*
4804  * Called to track nlinkdelta of the inode and parent directories prior to
4805  * unlinking a directory.
4806  */
4807 void
4808 softdep_setup_rmdir(dp, ip)
4809         struct inode *dp;
4810         struct inode *ip;
4811 {
4812         struct vnode *dvp;
4813
4814         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4815             ("softdep_setup_rmdir called on non-softdep filesystem"));
4816         dvp = ITOV(dp);
4817         ACQUIRE_LOCK(dp->i_ump);
4818         (void) inodedep_lookup_ip(ip);
4819         (void) inodedep_lookup_ip(dp);
4820         softdep_prelink(dvp, ITOV(ip));
4821         FREE_LOCK(dp->i_ump);
4822 }
4823
4824 /*
4825  * Called to track nlinkdelta of the inode and parent directories prior to
4826  * unlink.
4827  */
4828 void
4829 softdep_setup_unlink(dp, ip)
4830         struct inode *dp;
4831         struct inode *ip;
4832 {
4833         struct vnode *dvp;
4834
4835         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4836             ("softdep_setup_unlink called on non-softdep filesystem"));
4837         dvp = ITOV(dp);
4838         ACQUIRE_LOCK(dp->i_ump);
4839         (void) inodedep_lookup_ip(ip);
4840         (void) inodedep_lookup_ip(dp);
4841         softdep_prelink(dvp, ITOV(ip));
4842         FREE_LOCK(dp->i_ump);
4843 }
4844
4845 /*
4846  * Called to release the journal structures created by a failed non-directory
4847  * creation.  Adjusts nlinkdelta for non-journaling softdep.
4848  */
4849 void
4850 softdep_revert_create(dp, ip)
4851         struct inode *dp;
4852         struct inode *ip;
4853 {
4854         struct inodedep *inodedep;
4855         struct jaddref *jaddref;
4856         struct vnode *dvp;
4857
4858         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4859             ("softdep_revert_create called on non-softdep filesystem"));
4860         dvp = ITOV(dp);
4861         ACQUIRE_LOCK(dp->i_ump);
4862         inodedep = inodedep_lookup_ip(ip);
4863         if (DOINGSUJ(dvp)) {
4864                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4865                     inoreflst);
4866                 KASSERT(jaddref->ja_parent == dp->i_number,
4867                     ("softdep_revert_create: addref parent mismatch"));
4868                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4869         }
4870         FREE_LOCK(dp->i_ump);
4871 }
4872
4873 /*
4874  * Called to release the journal structures created by a failed link
4875  * addition.  Adjusts nlinkdelta for non-journaling softdep.
4876  */
4877 void
4878 softdep_revert_link(dp, ip)
4879         struct inode *dp;
4880         struct inode *ip;
4881 {
4882         struct inodedep *inodedep;
4883         struct jaddref *jaddref;
4884         struct vnode *dvp;
4885
4886         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4887             ("softdep_revert_link called on non-softdep filesystem"));
4888         dvp = ITOV(dp);
4889         ACQUIRE_LOCK(dp->i_ump);
4890         inodedep = inodedep_lookup_ip(ip);
4891         if (DOINGSUJ(dvp)) {
4892                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4893                     inoreflst);
4894                 KASSERT(jaddref->ja_parent == dp->i_number,
4895                     ("softdep_revert_link: addref parent mismatch"));
4896                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4897         }
4898         FREE_LOCK(dp->i_ump);
4899 }
4900
4901 /*
4902  * Called to release the journal structures created by a failed mkdir
4903  * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4904  */
4905 void
4906 softdep_revert_mkdir(dp, ip)
4907         struct inode *dp;
4908         struct inode *ip;
4909 {
4910         struct inodedep *inodedep;
4911         struct jaddref *jaddref;
4912         struct jaddref *dotaddref;
4913         struct vnode *dvp;
4914
4915         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4916             ("softdep_revert_mkdir called on non-softdep filesystem"));
4917         dvp = ITOV(dp);
4918
4919         ACQUIRE_LOCK(dp->i_ump);
4920         inodedep = inodedep_lookup_ip(dp);
4921         if (DOINGSUJ(dvp)) {
4922                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4923                     inoreflst);
4924                 KASSERT(jaddref->ja_parent == ip->i_number,
4925                     ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4926                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4927         }
4928         inodedep = inodedep_lookup_ip(ip);
4929         if (DOINGSUJ(dvp)) {
4930                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4931                     inoreflst);
4932                 KASSERT(jaddref->ja_parent == dp->i_number,
4933                     ("softdep_revert_mkdir: addref parent mismatch"));
4934                 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4935                     inoreflst, if_deps);
4936                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4937                 KASSERT(dotaddref->ja_parent == ip->i_number,
4938                     ("softdep_revert_mkdir: dot addref parent mismatch"));
4939                 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4940         }
4941         FREE_LOCK(dp->i_ump);
4942 }
4943
4944 /*
4945  * Called to correct nlinkdelta after a failed rmdir.
4946  */
4947 void
4948 softdep_revert_rmdir(dp, ip)
4949         struct inode *dp;
4950         struct inode *ip;
4951 {
4952
4953         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4954             ("softdep_revert_rmdir called on non-softdep filesystem"));
4955         ACQUIRE_LOCK(dp->i_ump);
4956         (void) inodedep_lookup_ip(ip);
4957         (void) inodedep_lookup_ip(dp);
4958         FREE_LOCK(dp->i_ump);
4959 }
4960
4961 /*
4962  * Protecting the freemaps (or bitmaps).
4963  *
4964  * To eliminate the need to execute fsck before mounting a filesystem
4965  * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicates that a live inode or block is
4967  * free.  So, when a block or inode is allocated, the bitmap should be
4968  * updated (on disk) before any new pointers.  When a block or inode is
4969  * freed, the bitmap should not be updated until all pointers have been
4970  * reset.  The latter dependency is handled by the delayed de-allocation
4971  * approach described below for block and inode de-allocation.  The former
4972  * dependency is handled by calling the following procedure when a block or
4973  * inode is allocated. When an inode is allocated an "inodedep" is created
4974  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4975  * Each "inodedep" is also inserted into the hash indexing structure so
4976  * that any additional link additions can be made dependent on the inode
4977  * allocation.
4978  *
4979  * The ufs filesystem maintains a number of free block counts (e.g., per
4980  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4981  * in addition to the bitmaps.  These counts are used to improve efficiency
4982  * during allocation and therefore must be consistent with the bitmaps.
4983  * There is no convenient way to guarantee post-crash consistency of these
4984  * counts with simple update ordering, for two main reasons: (1) The counts
4985  * and bitmaps for a single cylinder group block are not in the same disk
4986  * sector.  If a disk write is interrupted (e.g., by power failure), one may
4987  * be written and the other not.  (2) Some of the counts are located in the
4988  * superblock rather than the cylinder group block. So, we focus our soft
4989  * updates implementation on protecting the bitmaps. When mounting a
4990  * filesystem, we recompute the auxiliary counts from the bitmaps.
4991  */
4992
4993 /*
4994  * Called just after updating the cylinder group block to allocate an inode.
4995  */
4996 void
4997 softdep_setup_inomapdep(bp, ip, newinum, mode)
4998         struct buf *bp;         /* buffer for cylgroup block with inode map */
4999         struct inode *ip;       /* inode related to allocation */
5000         ino_t newinum;          /* new inode number being allocated */
5001         int mode;
5002 {
5003         struct inodedep *inodedep;
5004         struct bmsafemap *bmsafemap;
5005         struct jaddref *jaddref;
5006         struct mount *mp;
5007         struct fs *fs;
5008
5009         mp = UFSTOVFS(ip->i_ump);
5010         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5011             ("softdep_setup_inomapdep called on non-softdep filesystem"));
5012         fs = ip->i_ump->um_fs;
5013         jaddref = NULL;
5014
5015         /*
5016          * Allocate the journal reference add structure so that the bitmap
5017          * can be dependent on it.
5018          */
5019         if (MOUNTEDSUJ(mp)) {
5020                 jaddref = newjaddref(ip, newinum, 0, 0, mode);
5021                 jaddref->ja_state |= NEWBLOCK;
5022         }
5023
5024         /*
5025          * Create a dependency for the newly allocated inode.
5026          * Panic if it already exists as something is seriously wrong.
5027          * Otherwise add it to the dependency list for the buffer holding
5028          * the cylinder group map from which it was allocated.
5029          *
5030          * We have to preallocate a bmsafemap entry in case it is needed
5031          * in bmsafemap_lookup since once we allocate the inodedep, we
5032          * have to finish initializing it before we can FREE_LOCK().
5033          * By preallocating, we avoid FREE_LOCK() while doing a malloc
5034          * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5035          * creating the inodedep as it can be freed during the time
5036          * that we FREE_LOCK() while allocating the inodedep. We must
5037          * call workitem_alloc() before entering the locked section as
5038          * it also acquires the lock and we must avoid trying doing so
5039          * recursively.
5040          */
5041         bmsafemap = malloc(sizeof(struct bmsafemap),
5042             M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5043         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5044         ACQUIRE_LOCK(ip->i_ump);
5045         if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5046                 panic("softdep_setup_inomapdep: dependency %p for new"
5047                     "inode already exists", inodedep);
5048         bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5049         if (jaddref) {
5050                 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5051                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5052                     if_deps);
5053         } else {
5054                 inodedep->id_state |= ONDEPLIST;
5055                 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5056         }
5057         inodedep->id_bmsafemap = bmsafemap;
5058         inodedep->id_state &= ~DEPCOMPLETE;
5059         FREE_LOCK(ip->i_ump);
5060 }
5061
5062 /*
5063  * Called just after updating the cylinder group block to
5064  * allocate block or fragment.
5065  */
5066 void
5067 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5068         struct buf *bp;         /* buffer for cylgroup block with block map */
5069         struct mount *mp;       /* filesystem doing allocation */
5070         ufs2_daddr_t newblkno;  /* number of newly allocated block */
5071         int frags;              /* Number of fragments. */
5072         int oldfrags;           /* Previous number of fragments for extend. */
5073 {
5074         struct newblk *newblk;
5075         struct bmsafemap *bmsafemap;
5076         struct jnewblk *jnewblk;
5077         struct ufsmount *ump;
5078         struct fs *fs;
5079
5080         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5081             ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5082         ump = VFSTOUFS(mp);
5083         fs = ump->um_fs;
5084         jnewblk = NULL;
5085         /*
5086          * Create a dependency for the newly allocated block.
5087          * Add it to the dependency list for the buffer holding
5088          * the cylinder group map from which it was allocated.
5089          */
5090         if (MOUNTEDSUJ(mp)) {
5091                 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5092                 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5093                 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5094                 jnewblk->jn_state = ATTACHED;
5095                 jnewblk->jn_blkno = newblkno;
5096                 jnewblk->jn_frags = frags;
5097                 jnewblk->jn_oldfrags = oldfrags;
5098 #ifdef SUJ_DEBUG
5099                 {
5100                         struct cg *cgp;
5101                         uint8_t *blksfree;
5102                         long bno;
5103                         int i;
5104
5105                         cgp = (struct cg *)bp->b_data;
5106                         blksfree = cg_blksfree(cgp);
5107                         bno = dtogd(fs, jnewblk->jn_blkno);
5108                         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5109                             i++) {
5110                                 if (isset(blksfree, bno + i))
5111                                         panic("softdep_setup_blkmapdep: "
5112                                             "free fragment %d from %d-%d "
5113                                             "state 0x%X dep %p", i,
5114                                             jnewblk->jn_oldfrags,
5115                                             jnewblk->jn_frags,
5116                                             jnewblk->jn_state,
5117                                             jnewblk->jn_dep);
5118                         }
5119                 }
5120 #endif
5121         }
5122
5123         CTR3(KTR_SUJ,
5124             "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5125             newblkno, frags, oldfrags);
5126         ACQUIRE_LOCK(ump);
5127         if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5128                 panic("softdep_setup_blkmapdep: found block");
5129         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5130             dtog(fs, newblkno), NULL);
5131         if (jnewblk) {
5132                 jnewblk->jn_dep = (struct worklist *)newblk;
5133                 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5134         } else {
5135                 newblk->nb_state |= ONDEPLIST;
5136                 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5137         }
5138         newblk->nb_bmsafemap = bmsafemap;
5139         newblk->nb_jnewblk = jnewblk;
5140         FREE_LOCK(ump);
5141 }
5142
/*
 * Select the bmsafemap hash chain for cylinder group cg.
 * NOTE(review): bmsafemap_hash_size is ANDed with cg, so it is
 * presumably a mask rather than a count -- confirm where it is set.
 */
#define BMSAFEMAP_HASH(ump, cg)                                         \
        ((ump)->bmsafemap_hashtbl + ((cg) & (ump)->bmsafemap_hash_size))
5145
5146 static int
5147 bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5148         struct bmsafemap_hashhead *bmsafemaphd;
5149         int cg;
5150         struct bmsafemap **bmsafemapp;
5151 {
5152         struct bmsafemap *bmsafemap;
5153
5154         LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5155                 if (bmsafemap->sm_cg == cg)
5156                         break;
5157         if (bmsafemap) {
5158                 *bmsafemapp = bmsafemap;
5159                 return (1);
5160         }
5161         *bmsafemapp = NULL;
5162
5163         return (0);
5164 }
5165
5166 /*
5167  * Find the bmsafemap associated with a cylinder group buffer.
5168  * If none exists, create one. The buffer must be locked when
5169  * this routine is called and this routine must be called with
5170  * the softdep lock held. To avoid giving up the lock while
5171  * allocating a new bmsafemap, a preallocated bmsafemap may be
5172  * provided. If it is provided but not needed, it is freed.
5173  */
5174 static struct bmsafemap *
5175 bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5176         struct mount *mp;
5177         struct buf *bp;
5178         int cg;
5179         struct bmsafemap *newbmsafemap;
5180 {
5181         struct bmsafemap_hashhead *bmsafemaphd;
5182         struct bmsafemap *bmsafemap, *collision;
5183         struct worklist *wk;
5184         struct ufsmount *ump;
5185
5186         ump = VFSTOUFS(mp);
5187         LOCK_OWNED(ump);
5188         KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5189         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5190                 if (wk->wk_type == D_BMSAFEMAP) {
5191                         if (newbmsafemap)
5192                                 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5193                         return (WK_BMSAFEMAP(wk));
5194                 }
5195         }
5196         bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5197         if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5198                 if (newbmsafemap)
5199                         WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5200                 return (bmsafemap);
5201         }
5202         if (newbmsafemap) {
5203                 bmsafemap = newbmsafemap;
5204         } else {
5205                 FREE_LOCK(ump);
5206                 bmsafemap = malloc(sizeof(struct bmsafemap),
5207                         M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5208                 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5209                 ACQUIRE_LOCK(ump);
5210         }
5211         bmsafemap->sm_buf = bp;
5212         LIST_INIT(&bmsafemap->sm_inodedephd);
5213         LIST_INIT(&bmsafemap->sm_inodedepwr);
5214         LIST_INIT(&bmsafemap->sm_newblkhd);
5215         LIST_INIT(&bmsafemap->sm_newblkwr);
5216         LIST_INIT(&bmsafemap->sm_jaddrefhd);
5217         LIST_INIT(&bmsafemap->sm_jnewblkhd);
5218         LIST_INIT(&bmsafemap->sm_freehd);
5219         LIST_INIT(&bmsafemap->sm_freewr);
5220         if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5221                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5222                 return (collision);
5223         }
5224         bmsafemap->sm_cg = cg;
5225         LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5226         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5227         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5228         return (bmsafemap);
5229 }
5230
5231 /*
5232  * Direct block allocation dependencies.
5233  *
5234  * When a new block is allocated, the corresponding disk locations must be
5235  * initialized (with zeros or new data) before the on-disk inode points to
5236  * them.  Also, the freemap from which the block was allocated must be
5237  * updated (on disk) before the inode's pointer. These two dependencies are
5238  * independent of each other and are needed for all file blocks and indirect
5239  * blocks that are pointed to directly by the inode.  Just before the
5240  * "in-core" version of the inode is updated with a newly allocated block
5241  * number, a procedure (below) is called to setup allocation dependency
5242  * structures.  These structures are removed when the corresponding
5243  * dependencies are satisfied or when the block allocation becomes obsolete
5244  * (i.e., the file is deleted, the block is de-allocated, or the block is a
5245  * fragment that gets upgraded).  All of these cases are handled in
5246  * procedures described later.
5247  *
5248  * When a file extension causes a fragment to be upgraded, either to a larger
5249  * fragment or to a full block, the on-disk location may change (if the
5250  * previous fragment could not simply be extended). In this case, the old
5251  * fragment must be de-allocated, but not until after the inode's pointer has
5252  * been updated. In most cases, this is handled by later procedures, which
5253  * will construct a "freefrag" structure to be added to the workitem queue
5254  * when the inode update is complete (or obsolete).  The main exception to
5255  * this is when an allocation occurs while a pending allocation dependency
5256  * (for the same block pointer) remains.  This case is handled in the main
5257  * allocation dependency setup procedure by immediately freeing the
5258  * unreferenced fragments.
5259  */
5260 void
5261 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5262         struct inode *ip;       /* inode to which block is being added */
5263         ufs_lbn_t off;          /* block pointer within inode */
5264         ufs2_daddr_t newblkno;  /* disk block number being added */
5265         ufs2_daddr_t oldblkno;  /* previous block number, 0 unless frag */
5266         long newsize;           /* size of new block */
5267         long oldsize;           /* size of new block */
5268         struct buf *bp;         /* bp for allocated block */
5269 {
5270         struct allocdirect *adp, *oldadp;
5271         struct allocdirectlst *adphead;
5272         struct freefrag *freefrag;
5273         struct inodedep *inodedep;
5274         struct pagedep *pagedep;
5275         struct jnewblk *jnewblk;
5276         struct newblk *newblk;
5277         struct mount *mp;
5278         ufs_lbn_t lbn;
5279
5280         lbn = bp->b_lblkno;
5281         mp = UFSTOVFS(ip->i_ump);
5282         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5283             ("softdep_setup_allocdirect called on non-softdep filesystem"));
5284         if (oldblkno && oldblkno != newblkno)
5285                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5286         else
5287                 freefrag = NULL;
5288
5289         CTR6(KTR_SUJ,
5290             "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5291             "off %jd newsize %ld oldsize %d",
5292             ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5293         ACQUIRE_LOCK(ip->i_ump);
5294         if (off >= NDADDR) {
5295                 if (lbn > 0)
5296                         panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5297                             lbn, off);
5298                 /* allocating an indirect block */
5299                 if (oldblkno != 0)
5300                         panic("softdep_setup_allocdirect: non-zero indir");
5301         } else {
5302                 if (off != lbn)
5303                         panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5304                             lbn, off);
5305                 /*
5306                  * Allocating a direct block.
5307                  *
5308                  * If we are allocating a directory block, then we must
5309                  * allocate an associated pagedep to track additions and
5310                  * deletions.
5311                  */
5312                 if ((ip->i_mode & IFMT) == IFDIR)
5313                         pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5314                             &pagedep);
5315         }
5316         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5317                 panic("softdep_setup_allocdirect: lost block");
5318         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5319             ("softdep_setup_allocdirect: newblk already initialized"));
5320         /*
5321          * Convert the newblk to an allocdirect.
5322          */
5323         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5324         adp = (struct allocdirect *)newblk;
5325         newblk->nb_freefrag = freefrag;
5326         adp->ad_offset = off;
5327         adp->ad_oldblkno = oldblkno;
5328         adp->ad_newsize = newsize;
5329         adp->ad_oldsize = oldsize;
5330
5331         /*
5332          * Finish initializing the journal.
5333          */
5334         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5335                 jnewblk->jn_ino = ip->i_number;
5336                 jnewblk->jn_lbn = lbn;
5337                 add_to_journal(&jnewblk->jn_list);
5338         }
5339         if (freefrag && freefrag->ff_jdep != NULL &&
5340             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5341                 add_to_journal(freefrag->ff_jdep);
5342         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5343         adp->ad_inodedep = inodedep;
5344
5345         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5346         /*
5347          * The list of allocdirects must be kept in sorted and ascending
5348          * order so that the rollback routines can quickly determine the
5349          * first uncommitted block (the size of the file stored on disk
5350          * ends at the end of the lowest committed fragment, or if there
5351          * are no fragments, at the end of the highest committed block).
5352          * Since files generally grow, the typical case is that the new
5353          * block is to be added at the end of the list. We speed this
5354          * special case by checking against the last allocdirect in the
5355          * list before laboriously traversing the list looking for the
5356          * insertion point.
5357          */
5358         adphead = &inodedep->id_newinoupdt;
5359         oldadp = TAILQ_LAST(adphead, allocdirectlst);
5360         if (oldadp == NULL || oldadp->ad_offset <= off) {
5361                 /* insert at end of list */
5362                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5363                 if (oldadp != NULL && oldadp->ad_offset == off)
5364                         allocdirect_merge(adphead, adp, oldadp);
5365                 FREE_LOCK(ip->i_ump);
5366                 return;
5367         }
5368         TAILQ_FOREACH(oldadp, adphead, ad_next) {
5369                 if (oldadp->ad_offset >= off)
5370                         break;
5371         }
5372         if (oldadp == NULL)
5373                 panic("softdep_setup_allocdirect: lost entry");
5374         /* insert in middle of list */
5375         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5376         if (oldadp->ad_offset == off)
5377                 allocdirect_merge(adphead, adp, oldadp);
5378
5379         FREE_LOCK(ip->i_ump);
5380 }
5381
5382 /*
5383  * Merge a newer and older journal record to be stored either in a
5384  * newblock or freefrag.  This handles aggregating journal records for
5385  * fragment allocation into a second record as well as replacing a
5386  * journal free with an aborted journal allocation.  A segment for the
5387  * oldest record will be placed on wkhd if it has been written.  If not
5388  * the segment for the newer record will suffice.
5389  */
5390 static struct worklist *
5391 jnewblk_merge(new, old, wkhd)
5392         struct worklist *new;
5393         struct worklist *old;
5394         struct workhead *wkhd;
5395 {
5396         struct jnewblk *njnewblk;
5397         struct jnewblk *jnewblk;
5398
5399         /* Handle NULLs to simplify callers. */
5400         if (new == NULL)
5401                 return (old);
5402         if (old == NULL)
5403                 return (new);
5404         /* Replace a jfreefrag with a jnewblk. */
5405         if (new->wk_type == D_JFREEFRAG) {
5406                 if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5407                         panic("jnewblk_merge: blkno mismatch: %p, %p",
5408                             old, new);
5409                 cancel_jfreefrag(WK_JFREEFRAG(new));
5410                 return (old);
5411         }
5412         if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5413                 panic("jnewblk_merge: Bad type: old %d new %d\n",
5414                     old->wk_type, new->wk_type);
5415         /*
5416          * Handle merging of two jnewblk records that describe
5417          * different sets of fragments in the same block.
5418          */
5419         jnewblk = WK_JNEWBLK(old);
5420         njnewblk = WK_JNEWBLK(new);
5421         if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5422                 panic("jnewblk_merge: Merging disparate blocks.");
5423         /*
5424          * The record may be rolled back in the cg.
5425          */
5426         if (jnewblk->jn_state & UNDONE) {
5427                 jnewblk->jn_state &= ~UNDONE;
5428                 njnewblk->jn_state |= UNDONE;
5429                 njnewblk->jn_state &= ~ATTACHED;
5430         }
5431         /*
5432          * We modify the newer addref and free the older so that if neither
5433          * has been written the most up-to-date copy will be on disk.  If
5434          * both have been written but rolled back we only temporarily need
5435          * one of them to fix the bits when the cg write completes.
5436          */
5437         jnewblk->jn_state |= ATTACHED | COMPLETE;
5438         njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5439         cancel_jnewblk(jnewblk, wkhd);
5440         WORKLIST_REMOVE(&jnewblk->jn_list);
5441         free_jnewblk(jnewblk);
5442         return (new);
5443 }
5444
5445 /*
5446  * Replace an old allocdirect dependency with a newer one.
5447  * This routine must be called with splbio interrupts blocked.
5448  */
5449 static void
5450 allocdirect_merge(adphead, newadp, oldadp)
5451         struct allocdirectlst *adphead; /* head of list holding allocdirects */
5452         struct allocdirect *newadp;     /* allocdirect being added */
5453         struct allocdirect *oldadp;     /* existing allocdirect being checked */
5454 {
5455         struct worklist *wk;
5456         struct freefrag *freefrag;
5457
5458         freefrag = NULL;
5459         LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5460         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5461             newadp->ad_oldsize != oldadp->ad_newsize ||
5462             newadp->ad_offset >= NDADDR)
5463                 panic("%s %jd != new %jd || old size %ld != new %ld",
5464                     "allocdirect_merge: old blkno",
5465                     (intmax_t)newadp->ad_oldblkno,
5466                     (intmax_t)oldadp->ad_newblkno,
5467                     newadp->ad_oldsize, oldadp->ad_newsize);
5468         newadp->ad_oldblkno = oldadp->ad_oldblkno;
5469         newadp->ad_oldsize = oldadp->ad_oldsize;
5470         /*
5471          * If the old dependency had a fragment to free or had never
5472          * previously had a block allocated, then the new dependency
5473          * can immediately post its freefrag and adopt the old freefrag.
5474          * This action is done by swapping the freefrag dependencies.
5475          * The new dependency gains the old one's freefrag, and the
5476          * old one gets the new one and then immediately puts it on
5477          * the worklist when it is freed by free_newblk. It is
5478          * not possible to do this swap when the old dependency had a
5479          * non-zero size but no previous fragment to free. This condition
5480          * arises when the new block is an extension of the old block.
5481          * Here, the first part of the fragment allocated to the new
5482          * dependency is part of the block currently claimed on disk by
5483          * the old dependency, so cannot legitimately be freed until the
5484          * conditions for the new dependency are fulfilled.
5485          */
5486         freefrag = newadp->ad_freefrag;
5487         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5488                 newadp->ad_freefrag = oldadp->ad_freefrag;
5489                 oldadp->ad_freefrag = freefrag;
5490         }
5491         /*
5492          * If we are tracking a new directory-block allocation,
5493          * move it from the old allocdirect to the new allocdirect.
5494          */
5495         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5496                 WORKLIST_REMOVE(wk);
5497                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5498                         panic("allocdirect_merge: extra newdirblk");
5499                 WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5500         }
5501         TAILQ_REMOVE(adphead, oldadp, ad_next);
5502         /*
5503          * We need to move any journal dependencies over to the freefrag
5504          * that releases this block if it exists.  Otherwise we are
5505          * extending an existing block and we'll wait until that is
5506          * complete to release the journal space and extend the
5507          * new journal to cover this old space as well.
5508          */
5509         if (freefrag == NULL) {
5510                 if (oldadp->ad_newblkno != newadp->ad_newblkno)
5511                         panic("allocdirect_merge: %jd != %jd",
5512                             oldadp->ad_newblkno, newadp->ad_newblkno);
5513                 newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5514                     jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5515                     &oldadp->ad_block.nb_jnewblk->jn_list,
5516                     &newadp->ad_block.nb_jwork);
5517                 oldadp->ad_block.nb_jnewblk = NULL;
5518                 cancel_newblk(&oldadp->ad_block, NULL,
5519                     &newadp->ad_block.nb_jwork);
5520         } else {
5521                 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5522                     &freefrag->ff_list, &freefrag->ff_jwork);
5523                 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5524                     &freefrag->ff_jwork);
5525         }
5526         free_newblk(&oldadp->ad_block);
5527 }
5528
5529 /*
5530  * Allocate a jfreefrag structure to journal a single block free.
5531  */
5532 static struct jfreefrag *
5533 newjfreefrag(freefrag, ip, blkno, size, lbn)
5534         struct freefrag *freefrag;
5535         struct inode *ip;
5536         ufs2_daddr_t blkno;
5537         long size;
5538         ufs_lbn_t lbn;
5539 {
5540         struct jfreefrag *jfreefrag;
5541         struct fs *fs;
5542
5543         fs = ip->i_fs;
5544         jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5545             M_SOFTDEP_FLAGS);
5546         workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5547         jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5548         jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5549         jfreefrag->fr_ino = ip->i_number;
5550         jfreefrag->fr_lbn = lbn;
5551         jfreefrag->fr_blkno = blkno;
5552         jfreefrag->fr_frags = numfrags(fs, size);
5553         jfreefrag->fr_freefrag = freefrag;
5554
5555         return (jfreefrag);
5556 }
5557
5558 /*
5559  * Allocate a new freefrag structure.
5560  */
5561 static struct freefrag *
5562 newfreefrag(ip, blkno, size, lbn)
5563         struct inode *ip;
5564         ufs2_daddr_t blkno;
5565         long size;
5566         ufs_lbn_t lbn;
5567 {
5568         struct freefrag *freefrag;
5569         struct fs *fs;
5570
5571         CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5572             ip->i_number, blkno, size, lbn);
5573         fs = ip->i_fs;
5574         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5575                 panic("newfreefrag: frag size");
5576         freefrag = malloc(sizeof(struct freefrag),
5577             M_FREEFRAG, M_SOFTDEP_FLAGS);
5578         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5579         freefrag->ff_state = ATTACHED;
5580         LIST_INIT(&freefrag->ff_jwork);
5581         freefrag->ff_inum = ip->i_number;
5582         freefrag->ff_vtype = ITOV(ip)->v_type;
5583         freefrag->ff_blkno = blkno;
5584         freefrag->ff_fragsize = size;
5585
5586         if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5587                 freefrag->ff_jdep = (struct worklist *)
5588                     newjfreefrag(freefrag, ip, blkno, size, lbn);
5589         } else {
5590                 freefrag->ff_state |= DEPCOMPLETE;
5591                 freefrag->ff_jdep = NULL;
5592         }
5593
5594         return (freefrag);
5595 }
5596
5597 /*
5598  * This workitem de-allocates fragments that were replaced during
5599  * file block allocation.
5600  */
5601 static void
5602 handle_workitem_freefrag(freefrag)
5603         struct freefrag *freefrag;
5604 {
5605         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5606         struct workhead wkhd;
5607
5608         CTR3(KTR_SUJ,
5609             "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5610             freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5611         /*
5612          * It would be illegal to add new completion items to the
5613          * freefrag after it was schedule to be done so it must be
5614          * safe to modify the list head here.
5615          */
5616         LIST_INIT(&wkhd);
5617         ACQUIRE_LOCK(ump);
5618         LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5619         /*
5620          * If the journal has not been written we must cancel it here.
5621          */
5622         if (freefrag->ff_jdep) {
5623                 if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5624                         panic("handle_workitem_freefrag: Unexpected type %d\n",
5625                             freefrag->ff_jdep->wk_type);
5626                 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5627         }
5628         FREE_LOCK(ump);
5629         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5630            freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5631         ACQUIRE_LOCK(ump);
5632         WORKITEM_FREE(freefrag, D_FREEFRAG);
5633         FREE_LOCK(ump);
5634 }
5635
5636 /*
5637  * Set up a dependency structure for an external attributes data block.
5638  * This routine follows much of the structure of softdep_setup_allocdirect.
5639  * See the description of softdep_setup_allocdirect above for details.
5640  */
5641 void
5642 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5643         struct inode *ip;
5644         ufs_lbn_t off;
5645         ufs2_daddr_t newblkno;
5646         ufs2_daddr_t oldblkno;
5647         long newsize;
5648         long oldsize;
5649         struct buf *bp;
5650 {
5651         struct allocdirect *adp, *oldadp;
5652         struct allocdirectlst *adphead;
5653         struct freefrag *freefrag;
5654         struct inodedep *inodedep;
5655         struct jnewblk *jnewblk;
5656         struct newblk *newblk;
5657         struct mount *mp;
5658         ufs_lbn_t lbn;
5659
5660         mp = UFSTOVFS(ip->i_ump);
5661         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5662             ("softdep_setup_allocext called on non-softdep filesystem"));
5663         KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR",
5664                     (long long)off));
5665
5666         lbn = bp->b_lblkno;
5667         if (oldblkno && oldblkno != newblkno)
5668                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5669         else
5670                 freefrag = NULL;
5671
5672         ACQUIRE_LOCK(ip->i_ump);
5673         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5674                 panic("softdep_setup_allocext: lost block");
5675         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5676             ("softdep_setup_allocext: newblk already initialized"));
5677         /*
5678          * Convert the newblk to an allocdirect.
5679          */
5680         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5681         adp = (struct allocdirect *)newblk;
5682         newblk->nb_freefrag = freefrag;
5683         adp->ad_offset = off;
5684         adp->ad_oldblkno = oldblkno;
5685         adp->ad_newsize = newsize;
5686         adp->ad_oldsize = oldsize;
5687         adp->ad_state |=  EXTDATA;
5688
5689         /*
5690          * Finish initializing the journal.
5691          */
5692         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5693                 jnewblk->jn_ino = ip->i_number;
5694                 jnewblk->jn_lbn = lbn;
5695                 add_to_journal(&jnewblk->jn_list);
5696         }
5697         if (freefrag && freefrag->ff_jdep != NULL &&
5698             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5699                 add_to_journal(freefrag->ff_jdep);
5700         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5701         adp->ad_inodedep = inodedep;
5702
5703         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5704         /*
5705          * The list of allocdirects must be kept in sorted and ascending
5706          * order so that the rollback routines can quickly determine the
5707          * first uncommitted block (the size of the file stored on disk
5708          * ends at the end of the lowest committed fragment, or if there
5709          * are no fragments, at the end of the highest committed block).
5710          * Since files generally grow, the typical case is that the new
5711          * block is to be added at the end of the list. We speed this
5712          * special case by checking against the last allocdirect in the
5713          * list before laboriously traversing the list looking for the
5714          * insertion point.
5715          */
5716         adphead = &inodedep->id_newextupdt;
5717         oldadp = TAILQ_LAST(adphead, allocdirectlst);
5718         if (oldadp == NULL || oldadp->ad_offset <= off) {
5719                 /* insert at end of list */
5720                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5721                 if (oldadp != NULL && oldadp->ad_offset == off)
5722                         allocdirect_merge(adphead, adp, oldadp);
5723                 FREE_LOCK(ip->i_ump);
5724                 return;
5725         }
5726         TAILQ_FOREACH(oldadp, adphead, ad_next) {
5727                 if (oldadp->ad_offset >= off)
5728                         break;
5729         }
5730         if (oldadp == NULL)
5731                 panic("softdep_setup_allocext: lost entry");
5732         /* insert in middle of list */
5733         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5734         if (oldadp->ad_offset == off)
5735                 allocdirect_merge(adphead, adp, oldadp);
5736         FREE_LOCK(ip->i_ump);
5737 }
5738
5739 /*
5740  * Indirect block allocation dependencies.
5741  *
5742  * The same dependencies that exist for a direct block also exist when
5743  * a new block is allocated and pointed to by an entry in a block of
5744  * indirect pointers. The undo/redo states described above are also
5745  * used here. Because an indirect block contains many pointers that
5746  * may have dependencies, a second copy of the entire in-memory indirect
5747  * block is kept. The buffer cache copy is always completely up-to-date.
5748  * The second copy, which is used only as a source for disk writes,
5749  * contains only the safe pointers (i.e., those that have no remaining
5750  * update dependencies). The second copy is freed when all pointers
5751  * are safe. The cache is not allowed to replace indirect blocks with
5752  * pending update dependencies. If a buffer containing an indirect
5753  * block with dependencies is written, these routines will mark it
5754  * dirty again. It can only be successfully written once all the
5755  * dependencies are removed. The ffs_fsync routine in conjunction with
5756  * softdep_sync_metadata work together to get all the dependencies
5757  * removed so that a file can be successfully written to disk. Three
5758  * procedures are used when setting up indirect block pointer
5759  * dependencies. The division is necessary because of the organization
5760  * of the "balloc" routine and because of the distinction between file
5761  * pages and file metadata blocks.
5762  */
5763
5764 /*
5765  * Allocate a new allocindir structure.
5766  */
5767 static struct allocindir *
5768 newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5769         struct inode *ip;       /* inode for file being extended */
5770         int ptrno;              /* offset of pointer in indirect block */
5771         ufs2_daddr_t newblkno;  /* disk block number being added */
5772         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
5773         ufs_lbn_t lbn;
5774 {
5775         struct newblk *newblk;
5776         struct allocindir *aip;
5777         struct freefrag *freefrag;
5778         struct jnewblk *jnewblk;
5779
5780         if (oldblkno)
5781                 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5782         else
5783                 freefrag = NULL;
5784         ACQUIRE_LOCK(ip->i_ump);
5785         if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5786                 panic("new_allocindir: lost block");
5787         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5788             ("newallocindir: newblk already initialized"));
5789         WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5790         newblk->nb_freefrag = freefrag;
5791         aip = (struct allocindir *)newblk;
5792         aip->ai_offset = ptrno;
5793         aip->ai_oldblkno = oldblkno;
5794         aip->ai_lbn = lbn;
5795         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5796                 jnewblk->jn_ino = ip->i_number;
5797                 jnewblk->jn_lbn = lbn;
5798                 add_to_journal(&jnewblk->jn_list);
5799         }
5800         if (freefrag && freefrag->ff_jdep != NULL &&
5801             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5802                 add_to_journal(freefrag->ff_jdep);
5803         return (aip);
5804 }
5805
5806 /*
5807  * Called just before setting an indirect block pointer
5808  * to a newly allocated file page.
5809  */
5810 void
5811 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5812         struct inode *ip;       /* inode for file being extended */
5813         ufs_lbn_t lbn;          /* allocated block number within file */
5814         struct buf *bp;         /* buffer with indirect blk referencing page */
5815         int ptrno;              /* offset of pointer in indirect block */
5816         ufs2_daddr_t newblkno;  /* disk block number being added */
5817         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
5818         struct buf *nbp;        /* buffer holding allocated page */
5819 {
5820         struct inodedep *inodedep;
5821         struct freefrag *freefrag;
5822         struct allocindir *aip;
5823         struct pagedep *pagedep;
5824         struct mount *mp;
5825
5826         mp = UFSTOVFS(ip->i_ump);
5827         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5828             ("softdep_setup_allocindir_page called on non-softdep filesystem"));
5829         KASSERT(lbn == nbp->b_lblkno,
5830             ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5831             lbn, bp->b_lblkno));
5832         CTR4(KTR_SUJ,
5833             "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5834             "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5835         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5836         aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5837         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5838         /*
5839          * If we are allocating a directory page, then we must
5840          * allocate an associated pagedep to track additions and
5841          * deletions.
5842          */
5843         if ((ip->i_mode & IFMT) == IFDIR)
5844                 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5845         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5846         freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5847         FREE_LOCK(ip->i_ump);
5848         if (freefrag)
5849                 handle_workitem_freefrag(freefrag);
5850 }
5851
5852 /*
5853  * Called just before setting an indirect block pointer to a
5854  * newly allocated indirect block.
5855  */
5856 void
5857 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5858         struct buf *nbp;        /* newly allocated indirect block */
5859         struct inode *ip;       /* inode for file being extended */
5860         struct buf *bp;         /* indirect block referencing allocated block */
5861         int ptrno;              /* offset of pointer in indirect block */
5862         ufs2_daddr_t newblkno;  /* disk block number being added */
5863 {
5864         struct inodedep *inodedep;
5865         struct allocindir *aip;
5866         ufs_lbn_t lbn;
5867
5868         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
5869             ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
5870         CTR3(KTR_SUJ,
5871             "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5872             ip->i_number, newblkno, ptrno);
5873         lbn = nbp->b_lblkno;
5874         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5875         aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5876         inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
5877             &inodedep);
5878         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5879         if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5880                 panic("softdep_setup_allocindir_meta: Block already existed");
5881         FREE_LOCK(ip->i_ump);
5882 }
5883
5884 static void
5885 indirdep_complete(indirdep)
5886         struct indirdep *indirdep;
5887 {
5888         struct allocindir *aip;
5889
5890         LIST_REMOVE(indirdep, ir_next);
5891         indirdep->ir_state |= DEPCOMPLETE;
5892
5893         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5894                 LIST_REMOVE(aip, ai_next);
5895                 free_newblk(&aip->ai_block);
5896         }
5897         /*
5898          * If this indirdep is not attached to a buf it was simply waiting
5899          * on completion to clear completehd.  free_indirdep() asserts
5900          * that nothing is dangling.
5901          */
5902         if ((indirdep->ir_state & ONWORKLIST) == 0)
5903                 free_indirdep(indirdep);
5904 }
5905
5906 static struct indirdep *
5907 indirdep_lookup(mp, ip, bp)
5908         struct mount *mp;
5909         struct inode *ip;
5910         struct buf *bp;
5911 {
5912         struct indirdep *indirdep, *newindirdep;
5913         struct newblk *newblk;
5914         struct ufsmount *ump;
5915         struct worklist *wk;
5916         struct fs *fs;
5917         ufs2_daddr_t blkno;
5918
5919         ump = VFSTOUFS(mp);
5920         LOCK_OWNED(ump);
5921         indirdep = NULL;
5922         newindirdep = NULL;
5923         fs = ip->i_fs;
5924         for (;;) {
5925                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5926                         if (wk->wk_type != D_INDIRDEP)
5927                                 continue;
5928                         indirdep = WK_INDIRDEP(wk);
5929                         break;
5930                 }
5931                 /* Found on the buffer worklist, no new structure to free. */
5932                 if (indirdep != NULL && newindirdep == NULL)
5933                         return (indirdep);
5934                 if (indirdep != NULL && newindirdep != NULL)
5935                         panic("indirdep_lookup: simultaneous create");
5936                 /* None found on the buffer and a new structure is ready. */
5937                 if (indirdep == NULL && newindirdep != NULL)
5938                         break;
5939                 /* None found and no new structure available. */
5940                 FREE_LOCK(ump);
5941                 newindirdep = malloc(sizeof(struct indirdep),
5942                     M_INDIRDEP, M_SOFTDEP_FLAGS);
5943                 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5944                 newindirdep->ir_state = ATTACHED;
5945                 if (ip->i_ump->um_fstype == UFS1)
5946                         newindirdep->ir_state |= UFS1FMT;
5947                 TAILQ_INIT(&newindirdep->ir_trunc);
5948                 newindirdep->ir_saveddata = NULL;
5949                 LIST_INIT(&newindirdep->ir_deplisthd);
5950                 LIST_INIT(&newindirdep->ir_donehd);
5951                 LIST_INIT(&newindirdep->ir_writehd);
5952                 LIST_INIT(&newindirdep->ir_completehd);
5953                 if (bp->b_blkno == bp->b_lblkno) {
5954                         ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5955                             NULL, NULL);
5956                         bp->b_blkno = blkno;
5957                 }
5958                 newindirdep->ir_freeblks = NULL;
5959                 newindirdep->ir_savebp =
5960                     getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5961                 newindirdep->ir_bp = bp;
5962                 BUF_KERNPROC(newindirdep->ir_savebp);
5963                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5964                 ACQUIRE_LOCK(ump);
5965         }
5966         indirdep = newindirdep;
5967         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5968         /*
5969          * If the block is not yet allocated we don't set DEPCOMPLETE so
5970          * that we don't free dependencies until the pointers are valid.
5971          * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5972          * than using the hash.
5973          */
5974         if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5975                 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5976         else
5977                 indirdep->ir_state |= DEPCOMPLETE;
5978         return (indirdep);
5979 }
5980
5981 /*
5982  * Called to finish the allocation of the "aip" allocated
5983  * by one of the two routines above.
5984  */
5985 static struct freefrag *
5986 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5987         struct buf *bp;         /* in-memory copy of the indirect block */
5988         struct inode *ip;       /* inode for file being extended */
5989         struct inodedep *inodedep; /* Inodedep for ip */
5990         struct allocindir *aip; /* allocindir allocated by the above routines */
5991         ufs_lbn_t lbn;          /* Logical block number for this block. */
5992 {
5993         struct fs *fs;
5994         struct indirdep *indirdep;
5995         struct allocindir *oldaip;
5996         struct freefrag *freefrag;
5997         struct mount *mp;
5998
5999         LOCK_OWNED(ip->i_ump);
6000         mp = UFSTOVFS(ip->i_ump);
6001         fs = ip->i_fs;
6002         if (bp->b_lblkno >= 0)
6003                 panic("setup_allocindir_phase2: not indir blk");
6004         KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6005             ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6006         indirdep = indirdep_lookup(mp, ip, bp);
6007         KASSERT(indirdep->ir_savebp != NULL,
6008             ("setup_allocindir_phase2 NULL ir_savebp"));
6009         aip->ai_indirdep = indirdep;
6010         /*
6011          * Check for an unwritten dependency for this indirect offset.  If
6012          * there is, merge the old dependency into the new one.  This happens
6013          * as a result of reallocblk only.
6014          */
6015         freefrag = NULL;
6016         if (aip->ai_oldblkno != 0) {
6017                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6018                         if (oldaip->ai_offset == aip->ai_offset) {
6019                                 freefrag = allocindir_merge(aip, oldaip);
6020                                 goto done;
6021                         }
6022                 }
6023                 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6024                         if (oldaip->ai_offset == aip->ai_offset) {
6025                                 freefrag = allocindir_merge(aip, oldaip);
6026                                 goto done;
6027                         }
6028                 }
6029         }
6030 done:
6031         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6032         return (freefrag);
6033 }
6034
6035 /*
6036  * Merge two allocindirs which refer to the same block.  Move newblock
6037  * dependencies and setup the freefrags appropriately.
6038  */
6039 static struct freefrag *
6040 allocindir_merge(aip, oldaip)
6041         struct allocindir *aip;
6042         struct allocindir *oldaip;
6043 {
6044         struct freefrag *freefrag;
6045         struct worklist *wk;
6046
6047         if (oldaip->ai_newblkno != aip->ai_oldblkno)
6048                 panic("allocindir_merge: blkno");
6049         aip->ai_oldblkno = oldaip->ai_oldblkno;
6050         freefrag = aip->ai_freefrag;
6051         aip->ai_freefrag = oldaip->ai_freefrag;
6052         oldaip->ai_freefrag = NULL;
6053         KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6054         /*
6055          * If we are tracking a new directory-block allocation,
6056          * move it from the old allocindir to the new allocindir.
6057          */
6058         if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6059                 WORKLIST_REMOVE(wk);
6060                 if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6061                         panic("allocindir_merge: extra newdirblk");
6062                 WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6063         }
6064         /*
6065          * We can skip journaling for this freefrag and just complete
6066          * any pending journal work for the allocindir that is being
6067          * removed after the freefrag completes.
6068          */
6069         if (freefrag->ff_jdep)
6070                 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6071         LIST_REMOVE(oldaip, ai_next);
6072         freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6073             &freefrag->ff_list, &freefrag->ff_jwork);
6074         free_newblk(&oldaip->ai_block);
6075
6076         return (freefrag);
6077 }
6078
6079 static inline void
6080 setup_freedirect(freeblks, ip, i, needj)
6081         struct freeblks *freeblks;
6082         struct inode *ip;
6083         int i;
6084         int needj;
6085 {
6086         ufs2_daddr_t blkno;
6087         int frags;
6088
6089         blkno = DIP(ip, i_db[i]);
6090         if (blkno == 0)
6091                 return;
6092         DIP_SET(ip, i_db[i], 0);
6093         frags = sblksize(ip->i_fs, ip->i_size, i);
6094         frags = numfrags(ip->i_fs, frags);
6095         newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
6096 }
6097
6098 static inline void
6099 setup_freeext(freeblks, ip, i, needj)
6100         struct freeblks *freeblks;
6101         struct inode *ip;
6102         int i;
6103         int needj;
6104 {
6105         ufs2_daddr_t blkno;
6106         int frags;
6107
6108         blkno = ip->i_din2->di_extb[i];
6109         if (blkno == 0)
6110                 return;
6111         ip->i_din2->di_extb[i] = 0;
6112         frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
6113         frags = numfrags(ip->i_fs, frags);
6114         newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6115 }
6116
6117 static inline void
6118 setup_freeindir(freeblks, ip, i, lbn, needj)
6119         struct freeblks *freeblks;
6120         struct inode *ip;
6121         int i;
6122         ufs_lbn_t lbn;
6123         int needj;
6124 {
6125         ufs2_daddr_t blkno;
6126
6127         blkno = DIP(ip, i_ib[i]);
6128         if (blkno == 0)
6129                 return;
6130         DIP_SET(ip, i_ib[i], 0);
6131         newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
6132             0, needj);
6133 }
6134
6135 static inline struct freeblks *
6136 newfreeblks(mp, ip)
6137         struct mount *mp;
6138         struct inode *ip;
6139 {
6140         struct freeblks *freeblks;
6141
6142         freeblks = malloc(sizeof(struct freeblks),
6143                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6144         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6145         LIST_INIT(&freeblks->fb_jblkdephd);
6146         LIST_INIT(&freeblks->fb_jwork);
6147         freeblks->fb_ref = 0;
6148         freeblks->fb_cgwait = 0;
6149         freeblks->fb_state = ATTACHED;
6150         freeblks->fb_uid = ip->i_uid;
6151         freeblks->fb_inum = ip->i_number;
6152         freeblks->fb_vtype = ITOV(ip)->v_type;
6153         freeblks->fb_modrev = DIP(ip, i_modrev);
6154         freeblks->fb_devvp = ip->i_devvp;
6155         freeblks->fb_chkcnt = 0;
6156         freeblks->fb_len = 0;
6157
6158         return (freeblks);
6159 }
6160
6161 static void
6162 trunc_indirdep(indirdep, freeblks, bp, off)
6163         struct indirdep *indirdep;
6164         struct freeblks *freeblks;
6165         struct buf *bp;
6166         int off;
6167 {
6168         struct allocindir *aip, *aipn;
6169
6170         /*
6171          * The first set of allocindirs won't be in savedbp.
6172          */
6173         LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6174                 if (aip->ai_offset > off)
6175                         cancel_allocindir(aip, bp, freeblks, 1);
6176         LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6177                 if (aip->ai_offset > off)
6178                         cancel_allocindir(aip, bp, freeblks, 1);
6179         /*
6180          * These will exist in savedbp.
6181          */
6182         LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6183                 if (aip->ai_offset > off)
6184                         cancel_allocindir(aip, NULL, freeblks, 0);
6185         LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6186                 if (aip->ai_offset > off)
6187                         cancel_allocindir(aip, NULL, freeblks, 0);
6188 }
6189
/*
 * Follow the chain of indirects down to lastlbn creating a freework
 * structure for each.  This will be used to start indir_trunc() at
 * the right offset and create the journal records for the partial
 * truncation.  A second step will handle the truncated dependencies.
 *
 * The indirect block at logical block "lbn" (disk address "blkno") is
 * read if it is not already cached, the pointers beyond the one that
 * covers "lastlbn" are zeroed in the buffer, and the routine then
 * recurses into the child indirect block that covers "lastlbn" until
 * level 0 is reached.
 *
 * Returns 0 on success (including when blkno is 0 and there is nothing
 * to do) or the error reported by bufwait() if the block cannot be read.
 */
static int
setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
        struct freeblks *freeblks;
        struct inode *ip;
        ufs_lbn_t lbn;
        ufs_lbn_t lastlbn;
        ufs2_daddr_t blkno;
{
        struct indirdep *indirdep;
        struct indirdep *indirn;
        struct freework *freework;
        struct newblk *newblk;
        struct mount *mp;
        struct buf *bp;
        uint8_t *start;
        uint8_t *end;
        ufs_lbn_t lbnadd;
        int level;
        int error;
        int off;


        freework = NULL;
        /* No block at this level: nothing to truncate below it. */
        if (blkno == 0)
                return (0);
        mp = freeblks->fb_list.wk_mp;
        bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
        /* Not cached: issue the read by hand and wait for it. */
        if ((bp->b_flags & B_CACHE) == 0) {
                bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
                bp->b_iocmd = BIO_READ;
                bp->b_flags &= ~B_INVAL;
                bp->b_ioflags &= ~BIO_ERROR;
                vfs_busy_pages(bp, 0);
                bp->b_iooffset = dbtob(bp->b_blkno);
                bstrategy(bp);
                curthread->td_ru.ru_inblock++;
                error = bufwait(bp);
                if (error) {
                        brelse(bp);
                        return (error);
                }
        }
        level = lbn_level(lbn);
        lbnadd = lbn_offset(ip->i_fs, level);
        /*
         * Compute the offset of the last block we want to keep.  Store
         * in the freework the first block we want to completely free.
         * NOTE(review): -(lbn + level) is presumably the first data lbn
         * mapped by this indirect block -- confirm against lbn_level().
         */
        off = (lastlbn - -(lbn + level)) / lbnadd;
        /*
         * The kept pointer is the last one in the block, so nothing in
         * this block is freed and no freework is needed; only descend.
         */
        if (off + 1 == NINDIR(ip->i_fs))
                goto nowork;
        freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
            0);
        /*
         * Link the freework into the indirdep.  This will prevent any new
         * allocations from proceeding until we are finished with the
         * truncate and the block is written.
         */
        ACQUIRE_LOCK(ip->i_ump);
        indirdep = indirdep_lookup(mp, ip, bp);
        if (indirdep->ir_freeblks)
                panic("setup_trunc_indir: indirdep already truncated.");
        TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
        freework->fw_indir = indirdep;
        /*
         * Cancel any allocindirs that will not make it to disk.
         * We have to do this for all copies of the indirdep that
         * live on this newblk.
         */
        if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
                newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
                LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
                        trunc_indirdep(indirn, freeblks, bp, off);
        } else
                trunc_indirdep(indirdep, freeblks, bp, off);
        FREE_LOCK(ip->i_ump);
        /*
         * Creation is protected by the buf lock. The saveddata is only
         * needed if a full truncation follows a partial truncation but it
         * is difficult to allocate in that case so we fetch it anyway.
         */
        if (indirdep->ir_saveddata == NULL)
                indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
                    M_SOFTDEP_FLAGS);
nowork:
        /* Fetch the blkno of the child and the zero start offset. */
        if (ip->i_ump->um_fstype == UFS1) {
                blkno = ((ufs1_daddr_t *)bp->b_data)[off];
                start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
        } else {
                blkno = ((ufs2_daddr_t *)bp->b_data)[off];
                start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
        }
        if (freework) {
                /* Zero the truncated pointers. */
                end = bp->b_data + bp->b_bcount;
                bzero(start, end - start);
                /* The buffer was modified, so schedule a delayed write. */
                bdwrite(bp);
        } else
                /* Nothing changed in this block; just release it. */
                bqrelse(bp);
        /* Level 0 pointers reference data blocks; the descent ends here. */
        if (level == 0)
                return (0);
        /* Derive the logical block number of the child indirect block. */
        lbn++; /* adjust level */
        lbn -= (off * lbnadd);
        return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
}
6302
/*
 * Complete the partial truncation of an indirect block setup by
 * setup_trunc_indir().  This zeros the truncated pointers in the saved
 * copy and writes them to disk before the freeblks is allowed to complete.
 *
 * Must be called with the softdep lock held (LOCK_OWNED).  The lock may
 * be dropped and re-acquired while waiting for the indirect block's
 * buffer lock.
 */
static void
complete_trunc_indir(freework)
        struct freework *freework;
{
        struct freework *fwn;
        struct indirdep *indirdep;
        struct ufsmount *ump;
        struct buf *bp;
        uintptr_t start;
        int count;

        ump = VFSTOUFS(freework->fw_list.wk_mp);
        LOCK_OWNED(ump);
        indirdep = freework->fw_indir;
        /*
         * Loop until either the buffer is found to be gone (bp == NULL)
         * or its lock has been obtained without sleeping.  ir_bp is
         * re-read on every pass because the softdep lock is released
         * while waiting.
         */
        for (;;) {
                bp = indirdep->ir_bp;
                /* See if the block was discarded. */
                if (bp == NULL)
                        break;
                /* Inline part of getdirtybuf().  We dont want bremfree. */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
                        break;
                /*
                 * Wait for the current holder using the softdep lock as
                 * the interlock.  NOTE(review): with LK_SLEEPFAIL this
                 * is presumably only a wait; if the lock is nevertheless
                 * obtained it is released again and the loop retries.
                 */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                    LOCK_PTR(ump)) == 0)
                        BUF_UNLOCK(bp);
                ACQUIRE_LOCK(ump);
        }
        freework->fw_state |= DEPCOMPLETE;
        TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
        /*
         * Zero the pointers in the saved copy.
         * "start" is first the byte offset of pointer fw_start (pointer
         * size depends on UFS1FMT), "count" the bytes from there to the
         * end of the saved buffer, and then "start" is turned into the
         * address inside ir_savebp's data.
         */
        if (indirdep->ir_state & UFS1FMT)
                start = sizeof(ufs1_daddr_t);
        else
                start = sizeof(ufs2_daddr_t);
        start *= freework->fw_start;
        count = indirdep->ir_savebp->b_bcount - start;
        start += (uintptr_t)indirdep->ir_savebp->b_data;
        bzero((char *)start, count);
        /*
         * We need to start the next truncation in the list if it has not
         * been started yet.
         */
        fwn = TAILQ_FIRST(&indirdep->ir_trunc);
        if (fwn != NULL) {
                if (fwn->fw_freeblks == indirdep->ir_freeblks)
                        TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
                if ((fwn->fw_state & ONWORKLIST) == 0)
                        freework_enqueue(fwn);
        }
        /*
         * If bp is NULL the block was fully truncated, restore
         * the saved block list otherwise free it if it is no
         * longer needed.
         */
        if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
                if (bp == NULL)
                        bcopy(indirdep->ir_saveddata,
                            indirdep->ir_savebp->b_data,
                            indirdep->ir_savebp->b_bcount);
                free(indirdep->ir_saveddata, M_INDIRDEP);
                indirdep->ir_saveddata = NULL;
        }
        /*
         * When bp is NULL there is a full truncation pending.  We
         * must wait for this full truncation to be journaled before
         * we can release this freework because the disk pointers will
         * never be written as zero.
         */
        if (bp == NULL)  {
                /* No journal work outstanding: complete right away. */
                if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
                        handle_written_freework(freework);
                else
                        WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
                           &freework->fw_list);
        } else {
                /* Complete when the real copy is written. */
                WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
                BUF_UNLOCK(bp);
        }
}
6390
6391 /*
6392  * Calculate the number of blocks we are going to release where datablocks
6393  * is the current total and length is the new file size.
6394  */
6395 static ufs2_daddr_t
6396 blkcount(fs, datablocks, length)
6397         struct fs *fs;
6398         ufs2_daddr_t datablocks;
6399         off_t length;
6400 {
6401         off_t totblks, numblks;
6402
6403         totblks = 0;
6404         numblks = howmany(length, fs->fs_bsize);
6405         if (numblks <= NDADDR) {
6406                 totblks = howmany(length, fs->fs_fsize);
6407                 goto out;
6408         }
6409         totblks = blkstofrags(fs, numblks);
6410         numblks -= NDADDR;
6411         /*
6412          * Count all single, then double, then triple indirects required.
6413          * Subtracting one indirects worth of blocks for each pass
6414          * acknowledges one of each pointed to by the inode.
6415          */
6416         for (;;) {
6417                 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6418                 numblks -= NINDIR(fs);
6419                 if (numblks <= 0)
6420                         break;
6421                 numblks = howmany(numblks, NINDIR(fs));
6422         }
6423 out:
6424         totblks = fsbtodb(fs, totblks);
6425         /*
6426          * Handle sparse files.  We can't reclaim more blocks than the inode
6427          * references.  We will correct it later in handle_complete_freeblks()
6428          * when we know the real count.
6429          */
6430         if (totblks > datablocks)
6431                 return (0);
6432         return (datablocks - totblks);
6433 }
6434
6435 /*
6436  * Handle freeblocks for journaled softupdate filesystems.
6437  *
6438  * Contrary to normal softupdates, we must preserve the block pointers in
6439  * indirects until their subordinates are free.  This is to avoid journaling
6440  * every block that is freed which may consume more space than the journal
6441  * itself.  The recovery program will see the free block journals at the
6442  * base of the truncated area and traverse them to reclaim space.  The
6443  * pointers in the inode may be cleared immediately after the journal
6444  * records are written because each direct and indirect pointer in the
6445  * inode is recorded in a journal.  This permits full truncation to proceed
6446  * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6447  *
6448  * The algorithm is as follows:
6449  * 1) Traverse the in-memory state and create journal entries to release
6450  *    the relevant blocks and full indirect trees.
6451  * 2) Traverse the indirect block chain adding partial truncation freework
6452  *    records to indirects in the path to lastlbn.  The freework will
6453  *    prevent new allocation dependencies from being satisfied in this
6454  *    indirect until the truncation completes.
6455  * 3) Read and lock the inode block, performing an update with the new size
6456  *    and pointers.  This prevents truncated data from becoming valid on
6457  *    disk through step 4.
6458  * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6459  *    eliminate journal work for those records that do not require it.
6460  * 5) Schedule the journal records to be written followed by the inode block.
6461  * 6) Allocate any necessary frags for the end of file.
6462  * 7) Zero any partially truncated blocks.
6463  *
6464  * From this truncation proceeds asynchronously using the freework and
6465  * indir_trunc machinery.  The file will not be extended again into a
6466  * partially truncated indirect block until all work is completed but
6467  * the normal dependency mechanism ensures that it is rolled back/forward
6468  * as appropriate.  Further truncation may occur without delay and is
6469  * serialized in indir_trunc().
6470  */
6471 void
6472 softdep_journal_freeblocks(ip, cred, length, flags)
6473         struct inode *ip;       /* The inode whose length is to be reduced */
6474         struct ucred *cred;
6475         off_t length;           /* The new length for the file */
6476         int flags;              /* IO_EXT and/or IO_NORMAL */
6477 {
6478         struct freeblks *freeblks, *fbn;
6479         struct worklist *wk, *wkn;
6480         struct inodedep *inodedep;
6481         struct jblkdep *jblkdep;
6482         struct allocdirect *adp, *adpn;
6483         struct ufsmount *ump;
6484         struct fs *fs;
6485         struct buf *bp;
6486         struct vnode *vp;
6487         struct mount *mp;
6488         ufs2_daddr_t extblocks, datablocks;
6489         ufs_lbn_t tmpval, lbn, lastlbn;
6490         int frags, lastoff, iboff, allocblock, needj, error, i;
6491
6492         fs = ip->i_fs;
6493         ump = ip->i_ump;
6494         mp = UFSTOVFS(ump);
6495         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6496             ("softdep_journal_freeblocks called on non-softdep filesystem"));
6497         vp = ITOV(ip);
6498         needj = 1;
6499         iboff = -1;
6500         allocblock = 0;
6501         extblocks = 0;
6502         datablocks = 0;
6503         frags = 0;
6504         freeblks = newfreeblks(mp, ip);
6505         ACQUIRE_LOCK(ump);
6506         /*
6507          * If we're truncating a removed file that will never be written
6508          * we don't need to journal the block frees.  The canceled journals
6509          * for the allocations will suffice.
6510          */
6511         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6512         if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6513             length == 0)
6514                 needj = 0;
6515         CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6516             ip->i_number, length, needj);
6517         FREE_LOCK(ump);
6518         /*
6519          * Calculate the lbn that we are truncating to.  This results in -1
6520          * if we're truncating the 0 bytes.  So it is the last lbn we want
6521          * to keep, not the first lbn we want to truncate.
6522          */
6523         lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6524         lastoff = blkoff(fs, length);
6525         /*
6526          * Compute frags we are keeping in lastlbn.  0 means all.
6527          */
6528         if (lastlbn >= 0 && lastlbn < NDADDR) {
6529                 frags = fragroundup(fs, lastoff);
6530                 /* adp offset of last valid allocdirect. */
6531                 iboff = lastlbn;
6532         } else if (lastlbn > 0)
6533                 iboff = NDADDR;
6534         if (fs->fs_magic == FS_UFS2_MAGIC)
6535                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6536         /*
6537          * Handle normal data blocks and indirects.  This section saves
6538          * values used after the inode update to complete frag and indirect
6539          * truncation.
6540          */
6541         if ((flags & IO_NORMAL) != 0) {
6542                 /*
6543                  * Handle truncation of whole direct and indirect blocks.
6544                  */
6545                 for (i = iboff + 1; i < NDADDR; i++)
6546                         setup_freedirect(freeblks, ip, i, needj);
6547                 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6548                     i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6549                         /* Release a whole indirect tree. */
6550                         if (lbn > lastlbn) {
6551                                 setup_freeindir(freeblks, ip, i, -lbn -i,
6552                                     needj);
6553                                 continue;
6554                         }
6555                         iboff = i + NDADDR;
6556                         /*
6557                          * Traverse partially truncated indirect tree.
6558                          */
6559                         if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6560                                 setup_trunc_indir(freeblks, ip, -lbn - i,
6561                                     lastlbn, DIP(ip, i_ib[i]));
6562                 }
6563                 /*
6564                  * Handle partial truncation to a frag boundary.
6565                  */
6566                 if (frags) {
6567                         ufs2_daddr_t blkno;
6568                         long oldfrags;
6569
6570                         oldfrags = blksize(fs, ip, lastlbn);
6571                         blkno = DIP(ip, i_db[lastlbn]);
6572                         if (blkno && oldfrags != frags) {
6573                                 oldfrags -= frags;
6574                                 oldfrags = numfrags(ip->i_fs, oldfrags);
6575                                 blkno += numfrags(ip->i_fs, frags);
6576                                 newfreework(ump, freeblks, NULL, lastlbn,
6577                                     blkno, oldfrags, 0, needj);
6578                                 if (needj)
6579                                         adjust_newfreework(freeblks,
6580                                             numfrags(ip->i_fs, frags));
6581                         } else if (blkno == 0)
6582                                 allocblock = 1;
6583                 }
6584                 /*
6585                  * Add a journal record for partial truncate if we are
6586                  * handling indirect blocks.  Non-indirects need no extra
6587                  * journaling.
6588                  */
6589                 if (length != 0 && lastlbn >= NDADDR) {
6590                         ip->i_flag |= IN_TRUNCATED;
6591                         newjtrunc(freeblks, length, 0);
6592                 }
6593                 ip->i_size = length;
6594                 DIP_SET(ip, i_size, ip->i_size);
6595                 datablocks = DIP(ip, i_blocks) - extblocks;
6596                 if (length != 0)
6597                         datablocks = blkcount(ip->i_fs, datablocks, length);
6598                 freeblks->fb_len = length;
6599         }
6600         if ((flags & IO_EXT) != 0) {
6601                 for (i = 0; i < NXADDR; i++)
6602                         setup_freeext(freeblks, ip, i, needj);
6603                 ip->i_din2->di_extsize = 0;
6604                 datablocks += extblocks;
6605         }
6606 #ifdef QUOTA
6607         /* Reference the quotas in case the block count is wrong in the end. */
6608         quotaref(vp, freeblks->fb_quota);
6609         (void) chkdq(ip, -datablocks, NOCRED, 0);
6610 #endif
6611         freeblks->fb_chkcnt = -datablocks;
6612         UFS_LOCK(ump);
6613         fs->fs_pendingblocks += datablocks;
6614         UFS_UNLOCK(ump);
6615         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6616         /*
6617          * Handle truncation of incomplete alloc direct dependencies.  We
6618          * hold the inode block locked to prevent incomplete dependencies
6619          * from reaching the disk while we are eliminating those that
6620          * have been truncated.  This is a partially inlined ffs_update().
6621          */
6622         ufs_itimes(vp);
6623         ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6624         error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6625             (int)fs->fs_bsize, cred, &bp);
6626         if (error) {
6627                 brelse(bp);
6628                 softdep_error("softdep_journal_freeblocks", error);
6629                 return;
6630         }
6631         if (bp->b_bufsize == fs->fs_bsize)
6632                 bp->b_flags |= B_CLUSTEROK;
6633         softdep_update_inodeblock(ip, bp, 0);
6634         if (ump->um_fstype == UFS1)
6635                 *((struct ufs1_dinode *)bp->b_data +
6636                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6637         else
6638                 *((struct ufs2_dinode *)bp->b_data +
6639                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6640         ACQUIRE_LOCK(ump);
6641         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6642         if ((inodedep->id_state & IOSTARTED) != 0)
6643                 panic("softdep_setup_freeblocks: inode busy");
6644         /*
6645          * Add the freeblks structure to the list of operations that
6646          * must await the zero'ed inode being written to disk. If we
6647          * still have a bitmap dependency (needj), then the inode
6648          * has never been written to disk, so we can process the
6649          * freeblks below once we have deleted the dependencies.
6650          */
6651         if (needj)
6652                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6653         else
6654                 freeblks->fb_state |= COMPLETE;
6655         if ((flags & IO_NORMAL) != 0) {
6656                 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6657                         if (adp->ad_offset > iboff)
6658                                 cancel_allocdirect(&inodedep->id_inoupdt, adp,
6659                                     freeblks);
6660                         /*
6661                          * Truncate the allocdirect.  We could eliminate
6662                          * or modify journal records as well.
6663                          */
6664                         else if (adp->ad_offset == iboff && frags)
6665                                 adp->ad_newsize = frags;
6666                 }
6667         }
6668         if ((flags & IO_EXT) != 0)
6669                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6670                         cancel_allocdirect(&inodedep->id_extupdt, adp,
6671                             freeblks);
6672         /*
6673          * Scan the bufwait list for newblock dependencies that will never
6674          * make it to disk.
6675          */
6676         LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6677                 if (wk->wk_type != D_ALLOCDIRECT)
6678                         continue;
6679                 adp = WK_ALLOCDIRECT(wk);
6680                 if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6681                     ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6682                         cancel_jfreeblk(freeblks, adp->ad_newblkno);
6683                         cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6684                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6685                 }
6686         }
6687         /*
6688          * Add journal work.
6689          */
6690         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6691                 add_to_journal(&jblkdep->jb_list);
6692         FREE_LOCK(ump);
6693         bdwrite(bp);
6694         /*
6695          * Truncate dependency structures beyond length.
6696          */
6697         trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6698         /*
6699          * This is only set when we need to allocate a fragment because
6700          * none existed at the end of a frag-sized file.  It handles only
6701          * allocating a new, zero filled block.
6702          */
6703         if (allocblock) {
6704                 ip->i_size = length - lastoff;
6705                 DIP_SET(ip, i_size, ip->i_size);
6706                 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6707                 if (error != 0) {
6708                         softdep_error("softdep_journal_freeblks", error);
6709                         return;
6710                 }
6711                 ip->i_size = length;
6712                 DIP_SET(ip, i_size, length);
6713                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
6714                 allocbuf(bp, frags);
6715                 ffs_update(vp, 0);
6716                 bawrite(bp);
6717         } else if (lastoff != 0 && vp->v_type != VDIR) {
6718                 int size;
6719
6720                 /*
6721                  * Zero the end of a truncated frag or block.
6722                  */
6723                 size = sblksize(fs, length, lastlbn);
6724                 error = bread(vp, lastlbn, size, cred, &bp);
6725                 if (error) {
6726                         softdep_error("softdep_journal_freeblks", error);
6727                         return;
6728                 }
6729                 bzero((char *)bp->b_data + lastoff, size - lastoff);
6730                 bawrite(bp);
6731
6732         }
6733         ACQUIRE_LOCK(ump);
6734         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6735         TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6736         freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6737         /*
6738          * We zero earlier truncations so they don't erroneously
6739          * update i_blocks.
6740          */
6741         if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6742                 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6743                         fbn->fb_len = 0;
6744         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6745             LIST_EMPTY(&freeblks->fb_jblkdephd))
6746                 freeblks->fb_state |= INPROGRESS;
6747         else
6748                 freeblks = NULL;
6749         FREE_LOCK(ump);
6750         if (freeblks)
6751                 handle_workitem_freeblocks(freeblks, 0);
6752         trunc_pages(ip, length, extblocks, flags);
6753
6754 }
6755
6756 /*
6757  * Flush a JOP_SYNC to the journal.
6758  */
6759 void
6760 softdep_journal_fsync(ip)
6761         struct inode *ip;
6762 {
6763         struct jfsync *jfsync;
6764
6765         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
6766             ("softdep_journal_fsync called on non-softdep filesystem"));
6767         if ((ip->i_flag & IN_TRUNCATED) == 0)
6768                 return;
6769         ip->i_flag &= ~IN_TRUNCATED;
6770         jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6771         workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6772         jfsync->jfs_size = ip->i_size;
6773         jfsync->jfs_ino = ip->i_number;
6774         ACQUIRE_LOCK(ip->i_ump);
6775         add_to_journal(&jfsync->jfs_list);
6776         jwait(&jfsync->jfs_list, MNT_WAIT);
6777         FREE_LOCK(ip->i_ump);
6778 }
6779
6780 /*
6781  * Block de-allocation dependencies.
6782  *
6783  * When blocks are de-allocated, the on-disk pointers must be nullified before
6784  * the blocks are made available for use by other files.  (The true
6785  * requirement is that old pointers must be nullified before new on-disk
6786  * pointers are set.  We chose this slightly more stringent requirement to
6787  * reduce complexity.) Our implementation handles this dependency by updating
6788  * the inode (or indirect block) appropriately but delaying the actual block
6789  * de-allocation (i.e., freemap and free space count manipulation) until
6790  * after the updated versions reach stable storage.  After the disk is
6791  * updated, the blocks can be safely de-allocated whenever it is convenient.
6792  * This implementation handles only the common case of reducing a file's
6793  * length to zero. Other cases are handled by the conventional synchronous
6794  * write approach.
6795  *
6796  * The ffs implementation with which we worked double-checks
6797  * the state of the block pointers and file size as it reduces
6798  * a file's length.  Some of this code is replicated here in our
6799  * soft updates implementation.  The freeblks->fb_chkcnt field is
6800  * used to transfer a part of this information to the procedure
6801  * that eventually de-allocates the blocks.
6802  *
6803  * This routine should be called from the routine that shortens
6804  * a file's length, before the inode's size or block pointers
6805  * are modified. It will save the block pointer information for
6806  * later release and zero the inode so that the calling routine
6807  * can release it.
6808  */
6809 void
6810 softdep_setup_freeblocks(ip, length, flags)
6811         struct inode *ip;       /* The inode whose length is to be reduced */
6812         off_t length;           /* The new length for the file */
6813         int flags;              /* IO_EXT and/or IO_NORMAL */
6814 {
6815         struct ufs1_dinode *dp1;
6816         struct ufs2_dinode *dp2;
6817         struct freeblks *freeblks;
6818         struct inodedep *inodedep;
6819         struct allocdirect *adp;
6820         struct ufsmount *ump;
6821         struct buf *bp;
6822         struct fs *fs;
6823         ufs2_daddr_t extblocks, datablocks;
6824         struct mount *mp;
6825         int i, delay, error;
6826         ufs_lbn_t tmpval;
6827         ufs_lbn_t lbn;
6828
6829         ump = ip->i_ump;
6830         mp = UFSTOVFS(ump);
6831         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6832             ("softdep_setup_freeblocks called on non-softdep filesystem"));
6833         CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6834             ip->i_number, length);
6835         KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
6836         fs = ip->i_fs;
6837         freeblks = newfreeblks(mp, ip);
6838         extblocks = 0;
6839         datablocks = 0;
6840         if (fs->fs_magic == FS_UFS2_MAGIC)
6841                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6842         if ((flags & IO_NORMAL) != 0) {
6843                 for (i = 0; i < NDADDR; i++)
6844                         setup_freedirect(freeblks, ip, i, 0);
6845                 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6846                     i++, lbn += tmpval, tmpval *= NINDIR(fs))
6847                         setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6848                 ip->i_size = 0;
6849                 DIP_SET(ip, i_size, 0);
6850                 datablocks = DIP(ip, i_blocks) - extblocks;
6851         }
6852         if ((flags & IO_EXT) != 0) {
6853                 for (i = 0; i < NXADDR; i++)
6854                         setup_freeext(freeblks, ip, i, 0);
6855                 ip->i_din2->di_extsize = 0;
6856                 datablocks += extblocks;
6857         }
6858 #ifdef QUOTA
6859         /* Reference the quotas in case the block count is wrong in the end. */
6860         quotaref(ITOV(ip), freeblks->fb_quota);
6861         (void) chkdq(ip, -datablocks, NOCRED, 0);
6862 #endif
6863         freeblks->fb_chkcnt = -datablocks;
6864         UFS_LOCK(ump);
6865         fs->fs_pendingblocks += datablocks;
6866         UFS_UNLOCK(ump);
6867         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6868         /*
6869          * Push the zero'ed inode to to its disk buffer so that we are free
6870          * to delete its dependencies below. Once the dependencies are gone
6871          * the buffer can be safely released.
6872          */
6873         if ((error = bread(ip->i_devvp,
6874             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6875             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6876                 brelse(bp);
6877                 softdep_error("softdep_setup_freeblocks", error);
6878         }
6879         if (ump->um_fstype == UFS1) {
6880                 dp1 = ((struct ufs1_dinode *)bp->b_data +
6881                     ino_to_fsbo(fs, ip->i_number));
6882                 ip->i_din1->di_freelink = dp1->di_freelink;
6883                 *dp1 = *ip->i_din1;
6884         } else {
6885                 dp2 = ((struct ufs2_dinode *)bp->b_data +
6886                     ino_to_fsbo(fs, ip->i_number));
6887                 ip->i_din2->di_freelink = dp2->di_freelink;
6888                 *dp2 = *ip->i_din2;
6889         }
6890         /*
6891          * Find and eliminate any inode dependencies.
6892          */
6893         ACQUIRE_LOCK(ump);
6894         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6895         if ((inodedep->id_state & IOSTARTED) != 0)
6896                 panic("softdep_setup_freeblocks: inode busy");
6897         /*
6898          * Add the freeblks structure to the list of operations that
6899          * must await the zero'ed inode being written to disk. If we
6900          * still have a bitmap dependency (delay == 0), then the inode
6901          * has never been written to disk, so we can process the
6902          * freeblks below once we have deleted the dependencies.
6903          */
6904         delay = (inodedep->id_state & DEPCOMPLETE);
6905         if (delay)
6906                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6907         else
6908                 freeblks->fb_state |= COMPLETE;
6909         /*
6910          * Because the file length has been truncated to zero, any
6911          * pending block allocation dependency structures associated
6912          * with this inode are obsolete and can simply be de-allocated.
6913          * We must first merge the two dependency lists to get rid of
6914          * any duplicate freefrag structures, then purge the merged list.
6915          * If we still have a bitmap dependency, then the inode has never
6916          * been written to disk, so we can free any fragments without delay.
6917          */
6918         if (flags & IO_NORMAL) {
6919                 merge_inode_lists(&inodedep->id_newinoupdt,
6920                     &inodedep->id_inoupdt);
6921                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
6922                         cancel_allocdirect(&inodedep->id_inoupdt, adp,
6923                             freeblks);
6924         }
6925         if (flags & IO_EXT) {
6926                 merge_inode_lists(&inodedep->id_newextupdt,
6927                     &inodedep->id_extupdt);
6928                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6929                         cancel_allocdirect(&inodedep->id_extupdt, adp,
6930                             freeblks);
6931         }
6932         FREE_LOCK(ump);
6933         bdwrite(bp);
6934         trunc_dependencies(ip, freeblks, -1, 0, flags);
6935         ACQUIRE_LOCK(ump);
6936         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6937                 (void) free_inodedep(inodedep);
6938         freeblks->fb_state |= DEPCOMPLETE;
6939         /*
6940          * If the inode with zeroed block pointers is now on disk
6941          * we can start freeing blocks.
6942          */
6943         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6944                 freeblks->fb_state |= INPROGRESS;
6945         else
6946                 freeblks = NULL;
6947         FREE_LOCK(ump);
6948         if (freeblks)
6949                 handle_workitem_freeblocks(freeblks, 0);
6950         trunc_pages(ip, length, extblocks, flags);
6951 }
6952
6953 /*
6954  * Eliminate pages from the page cache that back parts of this inode and
6955  * adjust the vnode pager's idea of our size.  This prevents stale data
6956  * from hanging around in the page cache.
6957  */
6958 static void
6959 trunc_pages(ip, length, extblocks, flags)
6960         struct inode *ip;
6961         off_t length;
6962         ufs2_daddr_t extblocks;
6963         int flags;
6964 {
6965         struct vnode *vp;
6966         struct fs *fs;
6967         ufs_lbn_t lbn;
6968         off_t end, extend;
6969
6970         vp = ITOV(ip);
6971         fs = ip->i_fs;
6972         extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6973         if ((flags & IO_EXT) != 0)
6974                 vn_pages_remove(vp, extend, 0);
6975         if ((flags & IO_NORMAL) == 0)
6976                 return;
6977         BO_LOCK(&vp->v_bufobj);
6978         drain_output(vp);
6979         BO_UNLOCK(&vp->v_bufobj);
6980         /*
6981          * The vnode pager eliminates file pages we eliminate indirects
6982          * below.
6983          */
6984         vnode_pager_setsize(vp, length);
6985         /*
6986          * Calculate the end based on the last indirect we want to keep.  If
6987          * the block extends into indirects we can just use the negative of
6988          * its lbn.  Doubles and triples exist at lower numbers so we must
6989          * be careful not to remove those, if they exist.  double and triple
6990          * indirect lbns do not overlap with others so it is not important
6991          * to verify how many levels are required.
6992          */
6993         lbn = lblkno(fs, length);
6994         if (lbn >= NDADDR) {
6995                 /* Calculate the virtual lbn of the triple indirect. */
6996                 lbn = -lbn - (NIADDR - 1);
6997                 end = OFF_TO_IDX(lblktosize(fs, lbn));
6998         } else
6999                 end = extend;
7000         vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7001 }
7002
7003 /*
7004  * See if the buf bp is in the range eliminated by truncation.
7005  */
7006 static int
7007 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
7008         struct buf *bp;
7009         int *blkoffp;
7010         ufs_lbn_t lastlbn;
7011         int lastoff;
7012         int flags;
7013 {
7014         ufs_lbn_t lbn;
7015
7016         *blkoffp = 0;
7017         /* Only match ext/normal blocks as appropriate. */
7018         if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7019             ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7020                 return (0);
7021         /* ALTDATA is always a full truncation. */
7022         if ((bp->b_xflags & BX_ALTDATA) != 0)
7023                 return (1);
7024         /* -1 is full truncation. */
7025         if (lastlbn == -1)
7026                 return (1);
7027         /*
7028          * If this is a partial truncate we only want those
7029          * blocks and indirect blocks that cover the range
7030          * we're after.
7031          */
7032         lbn = bp->b_lblkno;
7033         if (lbn < 0)
7034                 lbn = -(lbn + lbn_level(lbn));
7035         if (lbn < lastlbn)
7036                 return (0);
7037         /* Here we only truncate lblkno if it's partial. */
7038         if (lbn == lastlbn) {
7039                 if (lastoff == 0)
7040                         return (0);
7041                 *blkoffp = lastoff;
7042         }
7043         return (1);
7044 }
7045
/*
 * Eliminate any dependencies that exist in memory beyond lblkno:off
 *
 * Both the dirty and the clean buffer list of the vnode are scanned.
 * trunc_check_buf() decides whether a buffer is affected by the truncation
 * and fills in blkoff for it.  Affected dirty buffers are handed to
 * deallocate_dependencies(); affected clean buffers are either resized
 * with allocbuf() (blkoff != 0) or invalidated (blkoff == 0).
 *
 * Parameters:
 *      ip              inode being truncated.
 *      freeblks        passed through to deallocate_dependencies().
 *      lastlbn, lastoff, flags
 *                      truncation point and IO_EXT/IO_NORMAL selection,
 *                      interpreted by trunc_check_buf().
 *
 * The bufobj lock is acquired and released inside this routine.
 */
static void
trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
        struct inode *ip;
        struct freeblks *freeblks;
        ufs_lbn_t lastlbn;
        int lastoff;
        int flags;
{
        struct bufobj *bo;
        struct vnode *vp;
        struct buf *bp;
        int blkoff;

        /*
         * We must wait for any I/O in progress to finish so that
         * all potential buffers on the dirty list will be visible.
         * Once they are all there, walk the list and get rid of
         * any dependencies.
         */
        vp = ITOV(ip);
        bo = &vp->v_bufobj;
        BO_LOCK(bo);
        drain_output(vp);
        /* Start with every dirty buffer unmarked so each is examined. */
        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
                bp->b_vflags &= ~BV_SCANNED;
restart:
        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
                if (bp->b_vflags & BV_SCANNED)
                        continue;
                /* Buffers outside the truncated range are only marked. */
                if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
                        bp->b_vflags |= BV_SCANNED;
                        continue;
                }
                KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
                /* No buffer was returned; rescan from the head of the list. */
                if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
                        goto restart;
                BO_UNLOCK(bo);
                /*
                 * A nonzero return (EBUSY or ERESTART) means the buffer
                 * was not invalidated, so it is released with bqrelse();
                 * a zero return means it was marked B_INVAL | B_NOCACHE
                 * and is released with brelse().
                 */
                if (deallocate_dependencies(bp, freeblks, blkoff))
                        bqrelse(bp);
                else
                        brelse(bp);
                BO_LOCK(bo);
                /* The bufobj lock was dropped above; start the scan over. */
                goto restart;
        }
        /*
         * Now do the work of vtruncbuf while also matching indirect blocks.
         */
        TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
                bp->b_vflags &= ~BV_SCANNED;
cleanrestart:
        TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
                if (bp->b_vflags & BV_SCANNED)
                        continue;
                if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
                        bp->b_vflags |= BV_SCANNED;
                        continue;
                }
                /*
                 * The bufobj lock is passed to BUF_LOCK() as the interlock
                 * and is taken again below on both the failure and the
                 * success path.
                 */
                if (BUF_LOCK(bp,
                    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                    BO_LOCKPTR(bo)) == ENOLCK) {
                        BO_LOCK(bo);
                        goto cleanrestart;
                }
                bp->b_vflags |= BV_SCANNED;
                bremfree(bp);
                if (blkoff != 0) {
                        /* Partial truncation: resize the buffer to blkoff. */
                        allocbuf(bp, blkoff);
                        bqrelse(bp);
                } else {
                        /* Whole buffer is past the cut: discard it. */
                        bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
                        brelse(bp);
                }
                BO_LOCK(bo);
                goto cleanrestart;
        }
        drain_output(vp);
        BO_UNLOCK(bo);
}
7127
7128 static int
7129 cancel_pagedep(pagedep, freeblks, blkoff)
7130         struct pagedep *pagedep;
7131         struct freeblks *freeblks;
7132         int blkoff;
7133 {
7134         struct jremref *jremref;
7135         struct jmvref *jmvref;
7136         struct dirrem *dirrem, *tmp;
7137         int i;
7138
7139         /*
7140          * Copy any directory remove dependencies to the list
7141          * to be processed after the freeblks proceeds.  If
7142          * directory entry never made it to disk they
7143          * can be dumped directly onto the work list.
7144          */
7145         LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7146                 /* Skip this directory removal if it is intended to remain. */
7147                 if (dirrem->dm_offset < blkoff)
7148                         continue;
7149                 /*
7150                  * If there are any dirrems we wait for the journal write
7151                  * to complete and then restart the buf scan as the lock
7152                  * has been dropped.
7153                  */
7154                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7155                         jwait(&jremref->jr_list, MNT_WAIT);
7156                         return (ERESTART);
7157                 }
7158                 LIST_REMOVE(dirrem, dm_next);
7159                 dirrem->dm_dirinum = pagedep->pd_ino;
7160                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7161         }
7162         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7163                 jwait(&jmvref->jm_list, MNT_WAIT);
7164                 return (ERESTART);
7165         }
7166         /*
7167          * When we're partially truncating a pagedep we just want to flush
7168          * journal entries and return.  There can not be any adds in the
7169          * truncated portion of the directory and newblk must remain if
7170          * part of the block remains.
7171          */
7172         if (blkoff != 0) {
7173                 struct diradd *dap;
7174
7175                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7176                         if (dap->da_offset > blkoff)
7177                                 panic("cancel_pagedep: diradd %p off %d > %d",
7178                                     dap, dap->da_offset, blkoff);
7179                 for (i = 0; i < DAHASHSZ; i++)
7180                         LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7181                                 if (dap->da_offset > blkoff)
7182                                         panic("cancel_pagedep: diradd %p off %d > %d",
7183                                             dap, dap->da_offset, blkoff);
7184                 return (0);
7185         }
7186         /*
7187          * There should be no directory add dependencies present
7188          * as the directory could not be truncated until all
7189          * children were removed.
7190          */
7191         KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7192             ("deallocate_dependencies: pendinghd != NULL"));
7193         for (i = 0; i < DAHASHSZ; i++)
7194                 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7195                     ("deallocate_dependencies: diraddhd != NULL"));
7196         if ((pagedep->pd_state & NEWBLOCK) != 0)
7197                 free_newdirblk(pagedep->pd_newdirblk);
7198         if (free_pagedep(pagedep) == 0)
7199                 panic("Failed to free pagedep %p", pagedep);
7200         return (0);
7201 }
7202
/*
 * Reclaim any dependency structures from a buffer that is being
 * truncated. The buffer must be locked, thus, no I/O completion
 * operations can occur while we are manipulating its associated
 * dependencies. The per-filesystem softdep lock is acquired here while
 * the buffer's dependency list is walked.
 *
 * Parameters:
 *      bp              buffer whose b_dep work items are examined.
 *      freeblks        truncation record that receives work which has to
 *                      be finished together with the truncation.
 *      off             nonzero when only part of the buffer is being
 *                      truncated; it is then the size the buffer is
 *                      resized to with allocbuf().
 *
 * Returns:
 *      0               the buffer was marked B_INVAL | B_NOCACHE.
 *      EBUSY           off was nonzero; the buffer was resized and marked
 *                      BV_SCANNED instead of being invalidated.
 *      ERESTART        cancel_pagedep() returned nonzero; the softdep
 *                      lock has been released and the buffer is left
 *                      untouched.
 */
static int
deallocate_dependencies(bp, freeblks, off)
        struct buf *bp;
        struct freeblks *freeblks;
        int off;
{
        struct indirdep *indirdep;
        struct pagedep *pagedep;
        struct worklist *wk, *wkn;
        struct ufsmount *ump;

        /* Without dependencies there is no mount to lock; skip the walk. */
        if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
                goto done;
        ump = VFSTOUFS(wk->wk_mp);
        ACQUIRE_LOCK(ump);
        LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
                switch (wk->wk_type) {
                case D_INDIRDEP:
                        indirdep = WK_INDIRDEP(wk);
                        /*
                         * An indirdep is only expected on a buffer with a
                         * negative logical block number whose b_blkno
                         * matches the saved buffer's b_lblkno.
                         */
                        if (bp->b_lblkno >= 0 ||
                            bp->b_blkno != indirdep->ir_savebp->b_lblkno)
                                panic("deallocate_dependencies: not indir");
                        cancel_indirdep(indirdep, bp, freeblks);
                        continue;

                case D_PAGEDEP:
                        pagedep = WK_PAGEDEP(wk);
                        /* Nonzero means the caller must restart its scan. */
                        if (cancel_pagedep(pagedep, freeblks, off)) {
                                FREE_LOCK(ump);
                                return (ERESTART);
                        }
                        continue;

                case D_ALLOCINDIR:
                        /*
                         * Simply remove the allocindir, we'll find it via
                         * the indirdep where we can clear pointers if
                         * needed.
                         */
                        WORKLIST_REMOVE(wk);
                        continue;

                case D_FREEWORK:
                        /*
                         * A truncation is waiting for the zero'd pointers
                         * to be written.  It can be freed when the freeblks
                         * is journaled.
                         */
                        WORKLIST_REMOVE(wk);
                        wk->wk_state |= ONDEPLIST;
                        WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
                        /* Leaves the switch only; the list walk goes on. */
                        break;

                case D_ALLOCDIRECT:
                        /*
                         * Tolerated only for a partial truncation; for a
                         * full one it falls through to the panic below.
                         */
                        if (off != 0)
                                continue;
                        /* FALLTHROUGH */
                default:
                        panic("deallocate_dependencies: Unexpected type %s",
                            TYPENAME(wk->wk_type));
                        /* NOTREACHED */
                }
        }
        FREE_LOCK(ump);
done:
        /*
         * Don't throw away this buf, we were partially truncating and
         * some deps may always remain.
         */
        if (off) {
                allocbuf(bp, off);
                /* Keeps trunc_dependencies() from revisiting this buffer. */
                bp->b_vflags |= BV_SCANNED;
                return (EBUSY);
        }
        bp->b_flags |= B_INVAL | B_NOCACHE;

        return (0);
}
7288
7289 /*
7290  * An allocdirect is being canceled due to a truncate.  We must make sure
7291  * the journal entry is released in concert with the blkfree that releases
7292  * the storage.  Completed journal entries must not be released until the
7293  * space is no longer pointed to by the inode or in the bitmap.
7294  */
7295 static void
7296 cancel_allocdirect(adphead, adp, freeblks)
7297         struct allocdirectlst *adphead;
7298         struct allocdirect *adp;
7299         struct freeblks *freeblks;
7300 {
7301         struct freework *freework;
7302         struct newblk *newblk;
7303         struct worklist *wk;
7304
7305         TAILQ_REMOVE(adphead, adp, ad_next);
7306         newblk = (struct newblk *)adp;
7307         freework = NULL;
7308         /*
7309          * Find the correct freework structure.
7310          */
7311         LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7312                 if (wk->wk_type != D_FREEWORK)
7313                         continue;
7314                 freework = WK_FREEWORK(wk);
7315                 if (freework->fw_blkno == newblk->nb_newblkno)
7316                         break;
7317         }
7318         if (freework == NULL)
7319                 panic("cancel_allocdirect: Freework not found");
7320         /*
7321          * If a newblk exists at all we still have the journal entry that
7322          * initiated the allocation so we do not need to journal the free.
7323          */
7324         cancel_jfreeblk(freeblks, freework->fw_blkno);
7325         /*
7326          * If the journal hasn't been written the jnewblk must be passed
7327          * to the call to ffs_blkfree that reclaims the space.  We accomplish
7328          * this by linking the journal dependency into the freework to be
7329          * freed when freework_freeblock() is called.  If the journal has
7330          * been written we can simply reclaim the journal space when the
7331          * freeblks work is complete.
7332          */
7333         freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7334             &freeblks->fb_jwork);
7335         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7336 }
7337
7338
7339 /*
7340  * Cancel a new block allocation.  May be an indirect or direct block.  We
7341  * remove it from various lists and return any journal record that needs to
7342  * be resolved by the caller.
7343  *
7344  * A special consideration is made for indirects which were never pointed
7345  * at on disk and will never be found once this block is released.
7346  */
7347 static struct jnewblk *
7348 cancel_newblk(newblk, wk, wkhd)
7349         struct newblk *newblk;
7350         struct worklist *wk;
7351         struct workhead *wkhd;
7352 {
7353         struct jnewblk *jnewblk;
7354
7355         CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7356
7357         newblk->nb_state |= GOINGAWAY;
7358         /*
7359          * Previously we traversed the completedhd on each indirdep
7360          * attached to this newblk to cancel them and gather journal
7361          * work.  Since we need only the oldest journal segment and
7362          * the lowest point on the tree will always have the oldest
7363          * journal segment we are free to release the segments
7364          * of any subordinates and may leave the indirdep list to
7365          * indirdep_complete() when this newblk is freed.
7366          */
7367         if (newblk->nb_state & ONDEPLIST) {
7368                 newblk->nb_state &= ~ONDEPLIST;
7369                 LIST_REMOVE(newblk, nb_deps);
7370         }
7371         if (newblk->nb_state & ONWORKLIST)
7372                 WORKLIST_REMOVE(&newblk->nb_list);
7373         /*
7374          * If the journal entry hasn't been written we save a pointer to
7375          * the dependency that frees it until it is written or the
7376          * superseding operation completes.
7377          */
7378         jnewblk = newblk->nb_jnewblk;
7379         if (jnewblk != NULL && wk != NULL) {
7380                 newblk->nb_jnewblk = NULL;
7381                 jnewblk->jn_dep = wk;
7382         }
7383         if (!LIST_EMPTY(&newblk->nb_jwork))
7384                 jwork_move(wkhd, &newblk->nb_jwork);
7385         /*
7386          * When truncating we must free the newdirblk early to remove
7387          * the pagedep from the hash before returning.
7388          */
7389         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7390                 free_newdirblk(WK_NEWDIRBLK(wk));
7391         if (!LIST_EMPTY(&newblk->nb_newdirblk))
7392                 panic("cancel_newblk: extra newdirblk");
7393
7394         return (jnewblk);
7395 }
7396
7397 /*
7398  * Schedule the freefrag associated with a newblk to be released once
7399  * the pointers are written and the previous block is no longer needed.
7400  */
7401 static void
7402 newblk_freefrag(newblk)
7403         struct newblk *newblk;
7404 {
7405         struct freefrag *freefrag;
7406
7407         if (newblk->nb_freefrag == NULL)
7408                 return;
7409         freefrag = newblk->nb_freefrag;
7410         newblk->nb_freefrag = NULL;
7411         freefrag->ff_state |= COMPLETE;
7412         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7413                 add_to_worklist(&freefrag->ff_list, 0);
7414 }
7415
7416 /*
7417  * Free a newblk. Generate a new freefrag work request if appropriate.
7418  * This must be called after the inode pointer and any direct block pointers
7419  * are valid or fully removed via truncate or frag extension.
7420  */
7421 static void
7422 free_newblk(newblk)
7423         struct newblk *newblk;
7424 {
7425         struct indirdep *indirdep;
7426         struct worklist *wk;
7427
7428         KASSERT(newblk->nb_jnewblk == NULL,
7429             ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7430         KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7431             ("free_newblk: unclaimed newblk"));
7432         LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7433         newblk_freefrag(newblk);
7434         if (newblk->nb_state & ONDEPLIST)
7435                 LIST_REMOVE(newblk, nb_deps);
7436         if (newblk->nb_state & ONWORKLIST)
7437                 WORKLIST_REMOVE(&newblk->nb_list);
7438         LIST_REMOVE(newblk, nb_hash);
7439         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7440                 free_newdirblk(WK_NEWDIRBLK(wk));
7441         if (!LIST_EMPTY(&newblk->nb_newdirblk))
7442                 panic("free_newblk: extra newdirblk");
7443         while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7444                 indirdep_complete(indirdep);
7445         handle_jwork(&newblk->nb_jwork);
7446         WORKITEM_FREE(newblk, D_NEWBLK);
7447 }
7448
7449 /*
7450  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7451  * This routine must be called with splbio interrupts blocked.
7452  */
7453 static void
7454 free_newdirblk(newdirblk)
7455         struct newdirblk *newdirblk;
7456 {
7457         struct pagedep *pagedep;
7458         struct diradd *dap;
7459         struct worklist *wk;
7460
7461         LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7462         WORKLIST_REMOVE(&newdirblk->db_list);
7463         /*
7464          * If the pagedep is still linked onto the directory buffer
7465          * dependency chain, then some of the entries on the
7466          * pd_pendinghd list may not be committed to disk yet. In
7467          * this case, we will simply clear the NEWBLOCK flag and
7468          * let the pd_pendinghd list be processed when the pagedep
7469          * is next written. If the pagedep is no longer on the buffer
7470          * dependency chain, then all the entries on the pd_pending
7471          * list are committed to disk and we can free them here.
7472          */
7473         pagedep = newdirblk->db_pagedep;
7474         pagedep->pd_state &= ~NEWBLOCK;
7475         if ((pagedep->pd_state & ONWORKLIST) == 0) {
7476                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7477                         free_diradd(dap, NULL);
7478                 /*
7479                  * If no dependencies remain, the pagedep will be freed.
7480                  */
7481                 free_pagedep(pagedep);
7482         }
7483         /* Should only ever be one item in the list. */
7484         while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7485                 WORKLIST_REMOVE(wk);
7486                 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7487         }
7488         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7489 }
7490
7491 /*
7492  * Prepare an inode to be freed. The actual free operation is not
7493  * done until the zero'ed inode has been written to disk.
7494  */
7495 void
7496 softdep_freefile(pvp, ino, mode)
7497         struct vnode *pvp;
7498         ino_t ino;
7499         int mode;
7500 {
7501         struct inode *ip = VTOI(pvp);
7502         struct inodedep *inodedep;
7503         struct freefile *freefile;
7504         struct freeblks *freeblks;
7505         struct ufsmount *ump;
7506
7507         ump = ip->i_ump;
7508         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7509             ("softdep_freefile called on non-softdep filesystem"));
7510         /*
7511          * This sets up the inode de-allocation dependency.
7512          */
7513         freefile = malloc(sizeof(struct freefile),
7514                 M_FREEFILE, M_SOFTDEP_FLAGS);
7515         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7516         freefile->fx_mode = mode;
7517         freefile->fx_oldinum = ino;
7518         freefile->fx_devvp = ip->i_devvp;
7519         LIST_INIT(&freefile->fx_jwork);
7520         UFS_LOCK(ump);
7521         ip->i_fs->fs_pendinginodes += 1;
7522         UFS_UNLOCK(ump);
7523
7524         /*
7525          * If the inodedep does not exist, then the zero'ed inode has
7526          * been written to disk. If the allocated inode has never been
7527          * written to disk, then the on-disk inode is zero'ed. In either
7528          * case we can free the file immediately.  If the journal was
7529          * canceled before being written the inode will never make it to
7530          * disk and we must send the canceled journal entrys to
7531          * ffs_freefile() to be cleared in conjunction with the bitmap.
7532          * Any blocks waiting on the inode to write can be safely freed
7533          * here as it will never been written.
7534          */
7535         ACQUIRE_LOCK(ump);
7536         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7537         if (inodedep) {
7538                 /*
7539                  * Clear out freeblks that no longer need to reference
7540                  * this inode.
7541                  */
7542                 while ((freeblks =
7543                     TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7544                         TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7545                             fb_next);
7546                         freeblks->fb_state &= ~ONDEPLIST;
7547                 }
7548                 /*
7549                  * Remove this inode from the unlinked list.
7550                  */
7551                 if (inodedep->id_state & UNLINKED) {
7552                         /*
7553                          * Save the journal work to be freed with the bitmap
7554                          * before we clear UNLINKED.  Otherwise it can be lost
7555                          * if the inode block is written.
7556                          */
7557                         handle_bufwait(inodedep, &freefile->fx_jwork);
7558                         clear_unlinked_inodedep(inodedep);
7559                         /*
7560                          * Re-acquire inodedep as we've dropped the
7561                          * per-filesystem lock in clear_unlinked_inodedep().
7562                          */
7563                         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7564                 }
7565         }
7566         if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7567                 FREE_LOCK(ump);
7568                 handle_workitem_freefile(freefile);
7569                 return;
7570         }
7571         if ((inodedep->id_state & DEPCOMPLETE) == 0)
7572                 inodedep->id_state |= GOINGAWAY;
7573         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7574         FREE_LOCK(ump);
7575         if (ip->i_number == ino)
7576                 ip->i_flag |= IN_MODIFIED;
7577 }
7578
7579 /*
7580  * Check to see if an inode has never been written to disk. If
7581  * so free the inodedep and return success, otherwise return failure.
7582  * This routine must be called with splbio interrupts blocked.
7583  *
7584  * If we still have a bitmap dependency, then the inode has never
7585  * been written to disk. Drop the dependency as it is no longer
7586  * necessary since the inode is being deallocated. We set the
7587  * ALLCOMPLETE flags since the bitmap now properly shows that the
7588  * inode is not allocated. Even if the inode is actively being
7589  * written, it has been rolled back to its zero'ed state, so we
7590  * are ensured that a zero inode is what is on the disk. For short
7591  * lived files, this change will usually result in removing all the
7592  * dependencies from the inode so that it can be freed immediately.
7593  */
7594 static int
7595 check_inode_unwritten(inodedep)
7596         struct inodedep *inodedep;
7597 {
7598
7599         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7600
7601         if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7602             !LIST_EMPTY(&inodedep->id_dirremhd) ||
7603             !LIST_EMPTY(&inodedep->id_pendinghd) ||
7604             !LIST_EMPTY(&inodedep->id_bufwait) ||
7605             !LIST_EMPTY(&inodedep->id_inowait) ||
7606             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7607             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7608             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7609             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7610             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7611             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7612             inodedep->id_mkdiradd != NULL ||
7613             inodedep->id_nlinkdelta != 0)
7614                 return (0);
7615         /*
7616          * Another process might be in initiate_write_inodeblock_ufs[12]
7617          * trying to allocate memory without holding "Softdep Lock".
7618          */
7619         if ((inodedep->id_state & IOSTARTED) != 0 &&
7620             inodedep->id_savedino1 == NULL)
7621                 return (0);
7622
7623         if (inodedep->id_state & ONDEPLIST)
7624                 LIST_REMOVE(inodedep, id_deps);
7625         inodedep->id_state &= ~ONDEPLIST;
7626         inodedep->id_state |= ALLCOMPLETE;
7627         inodedep->id_bmsafemap = NULL;
7628         if (inodedep->id_state & ONWORKLIST)
7629                 WORKLIST_REMOVE(&inodedep->id_list);
7630         if (inodedep->id_savedino1 != NULL) {
7631                 free(inodedep->id_savedino1, M_SAVEDINO);
7632                 inodedep->id_savedino1 = NULL;
7633         }
7634         if (free_inodedep(inodedep) == 0)
7635                 panic("check_inode_unwritten: busy inode");
7636         return (1);
7637 }
7638
7639 static int
7640 check_inodedep_free(inodedep)
7641         struct inodedep *inodedep;
7642 {
7643
7644         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7645         if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7646             !LIST_EMPTY(&inodedep->id_dirremhd) ||
7647             !LIST_EMPTY(&inodedep->id_pendinghd) ||
7648             !LIST_EMPTY(&inodedep->id_bufwait) ||
7649             !LIST_EMPTY(&inodedep->id_inowait) ||
7650             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7651             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7652             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7653             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7654             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7655             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7656             inodedep->id_mkdiradd != NULL ||
7657             inodedep->id_nlinkdelta != 0 ||
7658             inodedep->id_savedino1 != NULL)
7659                 return (0);
7660         return (1);
7661 }
7662
7663 /*
7664  * Try to free an inodedep structure. Return 1 if it could be freed.
7665  */
7666 static int
7667 free_inodedep(inodedep)
7668         struct inodedep *inodedep;
7669 {
7670
7671         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7672         if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7673             !check_inodedep_free(inodedep))
7674                 return (0);
7675         if (inodedep->id_state & ONDEPLIST)
7676                 LIST_REMOVE(inodedep, id_deps);
7677         LIST_REMOVE(inodedep, id_hash);
7678         WORKITEM_FREE(inodedep, D_INODEDEP);
7679         return (1);
7680 }
7681
/*
 * Free the block referenced by a freework structure.  The parent freeblks
 * structure is released and completed when the final cg bitmap reaches
 * the disk.  This routine may be freeing a jnewblk which never made it to
 * disk in which case we do not have to wait as the operation is undone
 * in memory immediately.
 *
 * Parameters:
 *      freework        work item describing the block (fw_blkno,
 *                      fw_frags, fw_lbn) and its owning freeblks.
 *
 * Locking: entered with the softdep lock held (checked by LOCK_OWNED).
 * Unless the request is handed to complete_trunc_indir(), the lock is
 * released around freeblks_free() and ffs_blkfree() and reacquired
 * before returning.
 */
static void
freework_freeblock(freework)
        struct freework *freework;
{
        struct freeblks *freeblks;
        struct jnewblk *jnewblk;
        struct ufsmount *ump;
        struct workhead wkhd;
        struct fs *fs;
        int bsize;
        int needj;

        ump = VFSTOUFS(freework->fw_list.wk_mp);
        LOCK_OWNED(ump);
        /*
         * Handle partial truncate separately.
         */
        if (freework->fw_indir) {
                complete_trunc_indir(freework);
                return;
        }
        freeblks = freework->fw_freeblks;
        fs = ump->um_fs;
        /* needj: nonzero when MOUNTEDSUJ() holds for this mount. */
        needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
        bsize = lfragtosize(fs, freework->fw_frags);
        /* wkhd collects the work items that are passed to ffs_blkfree(). */
        LIST_INIT(&wkhd);
        /*
         * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
         * on the indirblk hashtable and prevents premature freeing.
         */
        freework->fw_state |= DEPCOMPLETE;
        /*
         * SUJ needs to wait for the segment referencing freed indirect
         * blocks to expire so that we know the checker will not confuse
         * a re-allocated indirect block with its old contents.
         */
        if (needj && freework->fw_lbn <= -NDADDR)
                indirblk_insert(freework);
        /*
         * If we are canceling an existing jnewblk pass it to the free
         * routine, otherwise pass the freeblk which will ultimately
         * release the freeblks.  If we're not journaling, we can just
         * free the freeblks immediately.
         */
        jnewblk = freework->fw_jnewblk;
        if (jnewblk != NULL) {
                cancel_jnewblk(jnewblk, &wkhd);
                /* Nothing to wait for; finish the freework below. */
                needj = 0;
        } else if (needj) {
                /*
                 * Mark the freework DELAYEDFREE and count it in
                 * fb_cgwait; handle_written_freework() decrements the
                 * count again.
                 */
                freework->fw_state |= DELAYEDFREE;
                freeblks->fb_cgwait++;
                WORKLIST_INSERT(&wkhd, &freework->fw_list);
        }
        /* The softdep lock is not held across the block free. */
        FREE_LOCK(ump);
        freeblks_free(ump, freeblks, btodb(bsize));
        CTR4(KTR_SUJ,
            "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
            freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
        ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
            freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
        ACQUIRE_LOCK(ump);
        /*
         * The jnewblk will be discarded and the bits in the map never
         * made it to disk.  We can immediately free the freeblk.
         */
        if (needj == 0)
                handle_written_freework(freework);
}
7757
7758 /*
7759  * We enqueue freework items that need processing back on the freeblks and
7760  * add the freeblks to the worklist.  This makes it easier to find all work
7761  * required to flush a truncation in process_truncates().
7762  */
7763 static void
7764 freework_enqueue(freework)
7765         struct freework *freework;
7766 {
7767         struct freeblks *freeblks;
7768
7769         freeblks = freework->fw_freeblks;
7770         if ((freework->fw_state & INPROGRESS) == 0)
7771                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7772         if ((freeblks->fb_state &
7773             (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7774             LIST_EMPTY(&freeblks->fb_jblkdephd))
7775                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7776 }
7777
7778 /*
7779  * Start, continue, or finish the process of freeing an indirect block tree.
7780  * The free operation may be paused at any point with fw_off containing the
7781  * offset to restart from.  This enables us to implement some flow control
7782  * for large truncates which may fan out and generate a huge number of
7783  * dependencies.
7784  */
7785 static void
7786 handle_workitem_indirblk(freework)
7787         struct freework *freework;
7788 {
7789         struct freeblks *freeblks;
7790         struct ufsmount *ump;
7791         struct fs *fs;
7792
7793         freeblks = freework->fw_freeblks;
7794         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7795         fs = ump->um_fs;
7796         if (freework->fw_state & DEPCOMPLETE) {
7797                 handle_written_freework(freework);
7798                 return;
7799         }
7800         if (freework->fw_off == NINDIR(fs)) {
7801                 freework_freeblock(freework);
7802                 return;
7803         }
7804         freework->fw_state |= INPROGRESS;
7805         FREE_LOCK(ump);
7806         indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7807             freework->fw_lbn);
7808         ACQUIRE_LOCK(ump);
7809 }
7810
7811 /*
7812  * Called when a freework structure attached to a cg buf is written.  The
7813  * ref on either the parent or the freeblks structure is released and
7814  * the freeblks is added back to the worklist if there is more work to do.
7815  */
7816 static void
7817 handle_written_freework(freework)
7818         struct freework *freework;
7819 {
7820         struct freeblks *freeblks;
7821         struct freework *parent;
7822
7823         freeblks = freework->fw_freeblks;
7824         parent = freework->fw_parent;
7825         if (freework->fw_state & DELAYEDFREE)
7826                 freeblks->fb_cgwait--;
7827         freework->fw_state |= COMPLETE;
7828         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7829                 WORKITEM_FREE(freework, D_FREEWORK);
7830         if (parent) {
7831                 if (--parent->fw_ref == 0)
7832                         freework_enqueue(parent);
7833                 return;
7834         }
7835         if (--freeblks->fb_ref != 0)
7836                 return;
7837         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7838             ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7839                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7840 }
7841
7842 /*
7843  * This workitem routine performs the block de-allocation.
7844  * The workitem is added to the pending list after the updated
7845  * inode block has been written to disk.  As mentioned above,
7846  * checks regarding the number of blocks de-allocated (compared
7847  * to the number of blocks allocated for the file) are also
7848  * performed in this function.
7849  */
7850 static int
7851 handle_workitem_freeblocks(freeblks, flags)
7852         struct freeblks *freeblks;
7853         int flags;
7854 {
7855         struct freework *freework;
7856         struct newblk *newblk;
7857         struct allocindir *aip;
7858         struct ufsmount *ump;
7859         struct worklist *wk;
7860
7861         KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7862             ("handle_workitem_freeblocks: Journal entries not written."));
7863         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7864         ACQUIRE_LOCK(ump);
7865         while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7866                 WORKLIST_REMOVE(wk);
7867                 switch (wk->wk_type) {
7868                 case D_DIRREM:
7869                         wk->wk_state |= COMPLETE;
7870                         add_to_worklist(wk, 0);
7871                         continue;
7872
7873                 case D_ALLOCDIRECT:
7874                         free_newblk(WK_NEWBLK(wk));
7875                         continue;
7876
7877                 case D_ALLOCINDIR:
7878                         aip = WK_ALLOCINDIR(wk);
7879                         freework = NULL;
7880                         if (aip->ai_state & DELAYEDFREE) {
7881                                 FREE_LOCK(ump);
7882                                 freework = newfreework(ump, freeblks, NULL,
7883                                     aip->ai_lbn, aip->ai_newblkno,
7884                                     ump->um_fs->fs_frag, 0, 0);
7885                                 ACQUIRE_LOCK(ump);
7886                         }
7887                         newblk = WK_NEWBLK(wk);
7888                         if (newblk->nb_jnewblk) {
7889                                 freework->fw_jnewblk = newblk->nb_jnewblk;
7890                                 newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7891                                 newblk->nb_jnewblk = NULL;
7892                         }
7893                         free_newblk(newblk);
7894                         continue;
7895
7896                 case D_FREEWORK:
7897                         freework = WK_FREEWORK(wk);
7898                         if (freework->fw_lbn <= -NDADDR)
7899                                 handle_workitem_indirblk(freework);
7900                         else
7901                                 freework_freeblock(freework);
7902                         continue;
7903                 default:
7904                         panic("handle_workitem_freeblocks: Unknown type %s",
7905                             TYPENAME(wk->wk_type));
7906                 }
7907         }
7908         if (freeblks->fb_ref != 0) {
7909                 freeblks->fb_state &= ~INPROGRESS;
7910                 wake_worklist(&freeblks->fb_list);
7911                 freeblks = NULL;
7912         }
7913         FREE_LOCK(ump);
7914         if (freeblks)
7915                 return handle_complete_freeblocks(freeblks, flags);
7916         return (0);
7917 }
7918
7919 /*
7920  * Handle completion of block free via truncate.  This allows fs_pending
7921  * to track the actual free block count more closely than if we only updated
7922  * it at the end.  We must be careful to handle cases where the block count
7923  * on free was incorrect.
7924  */
7925 static void
7926 freeblks_free(ump, freeblks, blocks)
7927         struct ufsmount *ump;
7928         struct freeblks *freeblks;
7929         int blocks;
7930 {
7931         struct fs *fs;
7932         ufs2_daddr_t remain;
7933
7934         UFS_LOCK(ump);
7935         remain = -freeblks->fb_chkcnt;
7936         freeblks->fb_chkcnt += blocks;
7937         if (remain > 0) {
7938                 if (remain < blocks)
7939                         blocks = remain;
7940                 fs = ump->um_fs;
7941                 fs->fs_pendingblocks -= blocks;
7942         }
7943         UFS_UNLOCK(ump);
7944 }
7945
7946 /*
7947  * Once all of the freework workitems are complete we can retire the
7948  * freeblocks dependency and any journal work awaiting completion.  This
7949  * can not be called until all other dependencies are stable on disk.
7950  */
7951 static int
7952 handle_complete_freeblocks(freeblks, flags)
7953         struct freeblks *freeblks;
7954         int flags;
7955 {
7956         struct inodedep *inodedep;
7957         struct inode *ip;
7958         struct vnode *vp;
7959         struct fs *fs;
7960         struct ufsmount *ump;
7961         ufs2_daddr_t spare;
7962
7963         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7964         fs = ump->um_fs;
7965         flags = LK_EXCLUSIVE | flags;
7966         spare = freeblks->fb_chkcnt;
7967
7968         /*
7969          * If we did not release the expected number of blocks we may have
7970          * to adjust the inode block count here.  Only do so if it wasn't
7971          * a truncation to zero and the modrev still matches.
7972          */
7973         if (spare && freeblks->fb_len != 0) {
7974                 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7975                     flags, &vp, FFSV_FORCEINSMQ) != 0)
7976                         return (EBUSY);
7977                 ip = VTOI(vp);
7978                 if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7979                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7980                         ip->i_flag |= IN_CHANGE;
7981                         /*
7982                          * We must wait so this happens before the
7983                          * journal is reclaimed.
7984                          */
7985                         ffs_update(vp, 1);
7986                 }
7987                 vput(vp);
7988         }
7989         if (spare < 0) {
7990                 UFS_LOCK(ump);
7991                 fs->fs_pendingblocks += spare;
7992                 UFS_UNLOCK(ump);
7993         }
7994 #ifdef QUOTA
7995         /* Handle spare. */
7996         if (spare)
7997                 quotaadj(freeblks->fb_quota, ump, -spare);
7998         quotarele(freeblks->fb_quota);
7999 #endif
8000         ACQUIRE_LOCK(ump);
8001         if (freeblks->fb_state & ONDEPLIST) {
8002                 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8003                     0, &inodedep);
8004                 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8005                 freeblks->fb_state &= ~ONDEPLIST;
8006                 if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8007                         free_inodedep(inodedep);
8008         }
8009         /*
8010          * All of the freeblock deps must be complete prior to this call
8011          * so it's now safe to complete earlier outstanding journal entries.
8012          */
8013         handle_jwork(&freeblks->fb_jwork);
8014         WORKITEM_FREE(freeblks, D_FREEBLKS);
8015         FREE_LOCK(ump);
8016         return (0);
8017 }
8018
8019 /*
8020  * Release blocks associated with the freeblks and stored in the indirect
8021  * block dbn. If level is greater than SINGLE, the block is an indirect block
8022  * and recursive calls to indirtrunc must be used to cleanse other indirect
8023  * blocks.
8024  *
8025  * This handles partial and complete truncation of blocks.  Partial is noted
8026  * with goingaway == 0.  In this case the freework is completed after the
8027  * zero'd indirects are written to disk.  For full truncation the freework
8028  * is completed after the block is freed.
8029  */
8030 static void
8031 indir_trunc(freework, dbn, lbn)
8032         struct freework *freework;
8033         ufs2_daddr_t dbn;
8034         ufs_lbn_t lbn;
8035 {
8036         struct freework *nfreework;
8037         struct workhead wkhd;
8038         struct freeblks *freeblks;
8039         struct buf *bp;
8040         struct fs *fs;
8041         struct indirdep *indirdep;
8042         struct ufsmount *ump;
8043         ufs1_daddr_t *bap1;
8044         ufs2_daddr_t nb, nnb, *bap2;
8045         ufs_lbn_t lbnadd, nlbn;
8046         int i, nblocks, ufs1fmt;
8047         int freedblocks;
8048         int goingaway;
8049         int freedeps;
8050         int needj;
8051         int level;
8052         int cnt;
8053
8054         freeblks = freework->fw_freeblks;
8055         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8056         fs = ump->um_fs;
8057         /*
8058          * Get buffer of block pointers to be freed.  There are three cases:
8059          *
8060          * 1) Partial truncate caches the indirdep pointer in the freework
8061          *    which provides us a back copy to the save bp which holds the
8062          *    pointers we want to clear.  When this completes the zero
8063          *    pointers are written to the real copy.
8064          * 2) The indirect is being completely truncated, cancel_indirdep()
8065          *    eliminated the real copy and placed the indirdep on the saved
8066          *    copy.  The indirdep and buf are discarded when this completes.
8067          * 3) The indirect was not in memory, we read a copy off of the disk
8068          *    using the devvp and drop and invalidate the buffer when we're
8069          *    done.
8070          */
8071         goingaway = 1;
8072         indirdep = NULL;
8073         if (freework->fw_indir != NULL) {
8074                 goingaway = 0;
8075                 indirdep = freework->fw_indir;
8076                 bp = indirdep->ir_savebp;
8077                 if (bp == NULL || bp->b_blkno != dbn)
8078                         panic("indir_trunc: Bad saved buf %p blkno %jd",
8079                             bp, (intmax_t)dbn);
8080         } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8081                 /*
8082                  * The lock prevents the buf dep list from changing and
8083                  * indirects on devvp should only ever have one dependency.
8084                  */
8085                 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8086                 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8087                         panic("indir_trunc: Bad indirdep %p from buf %p",
8088                             indirdep, bp);
8089         } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
8090             NOCRED, &bp) != 0) {
8091                 brelse(bp);
8092                 return;
8093         }
8094         ACQUIRE_LOCK(ump);
8095         /* Protects against a race with complete_trunc_indir(). */
8096         freework->fw_state &= ~INPROGRESS;
8097         /*
8098          * If we have an indirdep we need to enforce the truncation order
8099          * and discard it when it is complete.
8100          */
8101         if (indirdep) {
8102                 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8103                     !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8104                         /*
8105                          * Add the complete truncate to the list on the
8106                          * indirdep to enforce in-order processing.
8107                          */
8108                         if (freework->fw_indir == NULL)
8109                                 TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8110                                     freework, fw_next);
8111                         FREE_LOCK(ump);
8112                         return;
8113                 }
8114                 /*
8115                  * If we're goingaway, free the indirdep.  Otherwise it will
8116                  * linger until the write completes.
8117                  */
8118                 if (goingaway)
8119                         free_indirdep(indirdep);
8120         }
8121         FREE_LOCK(ump);
8122         /* Initialize pointers depending on block size. */
8123         if (ump->um_fstype == UFS1) {
8124                 bap1 = (ufs1_daddr_t *)bp->b_data;
8125                 nb = bap1[freework->fw_off];
8126                 ufs1fmt = 1;
8127                 bap2 = NULL;
8128         } else {
8129                 bap2 = (ufs2_daddr_t *)bp->b_data;
8130                 nb = bap2[freework->fw_off];
8131                 ufs1fmt = 0;
8132                 bap1 = NULL;
8133         }
8134         level = lbn_level(lbn);
8135         needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8136         lbnadd = lbn_offset(fs, level);
8137         nblocks = btodb(fs->fs_bsize);
8138         nfreework = freework;
8139         freedeps = 0;
8140         cnt = 0;
8141         /*
8142          * Reclaim blocks.  Traverses into nested indirect levels and
8143          * arranges for the current level to be freed when subordinates
8144          * are free when journaling.
8145          */
8146         for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8147                 if (i != NINDIR(fs) - 1) {
8148                         if (ufs1fmt)
8149                                 nnb = bap1[i+1];
8150                         else
8151                                 nnb = bap2[i+1];
8152                 } else
8153                         nnb = 0;
8154                 if (nb == 0)
8155                         continue;
8156                 cnt++;
8157                 if (level != 0) {
8158                         nlbn = (lbn + 1) - (i * lbnadd);
8159                         if (needj != 0) {
8160                                 nfreework = newfreework(ump, freeblks, freework,
8161                                     nlbn, nb, fs->fs_frag, 0, 0);
8162                                 freedeps++;
8163                         }
8164                         indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8165                 } else {
8166                         struct freedep *freedep;
8167
8168                         /*
8169                          * Attempt to aggregate freedep dependencies for
8170                          * all blocks being released to the same CG.
8171                          */
8172                         LIST_INIT(&wkhd);
8173                         if (needj != 0 &&
8174                             (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8175                                 freedep = newfreedep(freework);
8176                                 WORKLIST_INSERT_UNLOCKED(&wkhd,
8177                                     &freedep->fd_list);
8178                                 freedeps++;
8179                         }
8180                         CTR3(KTR_SUJ,
8181                             "indir_trunc: ino %d blkno %jd size %ld",
8182                             freeblks->fb_inum, nb, fs->fs_bsize);
8183                         ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8184                             fs->fs_bsize, freeblks->fb_inum,
8185                             freeblks->fb_vtype, &wkhd);
8186                 }
8187         }
8188         if (goingaway) {
8189                 bp->b_flags |= B_INVAL | B_NOCACHE;
8190                 brelse(bp);
8191         }
8192         freedblocks = 0;
8193         if (level == 0)
8194                 freedblocks = (nblocks * cnt);
8195         if (needj == 0)
8196                 freedblocks += nblocks;
8197         freeblks_free(ump, freeblks, freedblocks);
8198         /*
8199          * If we are journaling set up the ref counts and offset so this
8200          * indirect can be completed when its children are free.
8201          */
8202         if (needj) {
8203                 ACQUIRE_LOCK(ump);
8204                 freework->fw_off = i;
8205                 freework->fw_ref += freedeps;
8206                 freework->fw_ref -= NINDIR(fs) + 1;
8207                 if (level == 0)
8208                         freeblks->fb_cgwait += freedeps;
8209                 if (freework->fw_ref == 0)
8210                         freework_freeblock(freework);
8211                 FREE_LOCK(ump);
8212                 return;
8213         }
8214         /*
8215          * If we're not journaling we can free the indirect now.
8216          */
8217         dbn = dbtofsb(fs, dbn);
8218         CTR3(KTR_SUJ,
8219             "indir_trunc 2: ino %d blkno %jd size %ld",
8220             freeblks->fb_inum, dbn, fs->fs_bsize);
8221         ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8222             freeblks->fb_inum, freeblks->fb_vtype, NULL);
8223         /* Non SUJ softdep does single-threaded truncations. */
8224         if (freework->fw_blkno == dbn) {
8225                 freework->fw_state |= ALLCOMPLETE;
8226                 ACQUIRE_LOCK(ump);
8227                 handle_written_freework(freework);
8228                 FREE_LOCK(ump);
8229         }
8230         return;
8231 }
8232
8233 /*
8234  * Cancel an allocindir when it is removed via truncation.  When bp is not
8235  * NULL the indirect never appeared on disk and is scheduled to be freed
8236  * independently of the indir so we can more easily track journal work.
8237  */
8238 static void
8239 cancel_allocindir(aip, bp, freeblks, trunc)
8240         struct allocindir *aip;
8241         struct buf *bp;
8242         struct freeblks *freeblks;
8243         int trunc;
8244 {
8245         struct indirdep *indirdep;
8246         struct freefrag *freefrag;
8247         struct newblk *newblk;
8248
8249         newblk = (struct newblk *)aip;
8250         LIST_REMOVE(aip, ai_next);
8251         /*
8252          * We must eliminate the pointer in bp if it must be freed on its
8253          * own due to partial truncate or pending journal work.
8254          */
8255         if (bp && (trunc || newblk->nb_jnewblk)) {
8256                 /*
8257                  * Clear the pointer and mark the aip to be freed
8258                  * directly if it never existed on disk.
8259                  */
8260                 aip->ai_state |= DELAYEDFREE;
8261                 indirdep = aip->ai_indirdep;
8262                 if (indirdep->ir_state & UFS1FMT)
8263                         ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8264                 else
8265                         ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8266         }
8267         /*
8268          * When truncating the previous pointer will be freed via
8269          * savedbp.  Eliminate the freefrag which would dup free.
8270          */
8271         if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8272                 newblk->nb_freefrag = NULL;
8273                 if (freefrag->ff_jdep)
8274                         cancel_jfreefrag(
8275                             WK_JFREEFRAG(freefrag->ff_jdep));
8276                 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8277                 WORKITEM_FREE(freefrag, D_FREEFRAG);
8278         }
8279         /*
8280          * If the journal hasn't been written the jnewblk must be passed
8281          * to the call to ffs_blkfree that reclaims the space.  We accomplish
8282          * this by leaving the journal dependency on the newblk to be freed
8283          * when a freework is created in handle_workitem_freeblocks().
8284          */
8285         cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8286         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8287 }
8288
8289 /*
8290  * Create the mkdir dependencies for . and .. in a new directory.  Link them
8291  * in to a newdirblk so any subsequent additions are tracked properly.  The
8292  * caller is responsible for adding the mkdir1 dependency to the journal
8293  * and updating id_mkdiradd.  This function returns with the per-filesystem
8294  * lock held.
8295  */
8296 static struct mkdir *
8297 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8298         struct diradd *dap;
8299         ino_t newinum;
8300         ino_t dinum;
8301         struct buf *newdirbp;
8302         struct mkdir **mkdirp;
8303 {
8304         struct newblk *newblk;
8305         struct pagedep *pagedep;
8306         struct inodedep *inodedep;
8307         struct newdirblk *newdirblk;
8308         struct mkdir *mkdir1, *mkdir2;
8309         struct worklist *wk;
8310         struct jaddref *jaddref;
8311         struct ufsmount *ump;
8312         struct mount *mp;
8313
8314         mp = dap->da_list.wk_mp;
8315         ump = VFSTOUFS(mp);
8316         newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8317             M_SOFTDEP_FLAGS);
8318         workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8319         LIST_INIT(&newdirblk->db_mkdir);
8320         mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8321         workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8322         mkdir1->md_state = ATTACHED | MKDIR_BODY;
8323         mkdir1->md_diradd = dap;
8324         mkdir1->md_jaddref = NULL;
8325         mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8326         workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8327         mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8328         mkdir2->md_diradd = dap;
8329         mkdir2->md_jaddref = NULL;
8330         if (MOUNTEDSUJ(mp) == 0) {
8331                 mkdir1->md_state |= DEPCOMPLETE;
8332                 mkdir2->md_state |= DEPCOMPLETE;
8333         }
8334         /*
8335          * Dependency on "." and ".." being written to disk.
8336          */
8337         mkdir1->md_buf = newdirbp;
8338         ACQUIRE_LOCK(VFSTOUFS(mp));
8339         LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8340         /*
8341          * We must link the pagedep, allocdirect, and newdirblk for
8342          * the initial file page so the pointer to the new directory
8343          * is not written until the directory contents are live and
8344          * any subsequent additions are not marked live until the
8345          * block is reachable via the inode.
8346          */
8347         if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8348                 panic("setup_newdir: lost pagedep");
8349         LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8350                 if (wk->wk_type == D_ALLOCDIRECT)
8351                         break;
8352         if (wk == NULL)
8353                 panic("setup_newdir: lost allocdirect");
8354         if (pagedep->pd_state & NEWBLOCK)
8355                 panic("setup_newdir: NEWBLOCK already set");
8356         newblk = WK_NEWBLK(wk);
8357         pagedep->pd_state |= NEWBLOCK;
8358         pagedep->pd_newdirblk = newdirblk;
8359         newdirblk->db_pagedep = pagedep;
8360         WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8361         WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8362         /*
8363          * Look up the inodedep for the parent directory so that we
8364          * can link mkdir2 into the pending dotdot jaddref or
8365          * the inode write if there is none.  If the inode is
8366          * ALLCOMPLETE and no jaddref is present all dependencies have
8367          * been satisfied and mkdir2 can be freed.
8368          */
8369         inodedep_lookup(mp, dinum, 0, &inodedep);
8370         if (MOUNTEDSUJ(mp)) {
8371                 if (inodedep == NULL)
8372                         panic("setup_newdir: Lost parent.");
8373                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8374                     inoreflst);
8375                 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8376                     (jaddref->ja_state & MKDIR_PARENT),
8377                     ("setup_newdir: bad dotdot jaddref %p", jaddref));
8378                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8379                 mkdir2->md_jaddref = jaddref;
8380                 jaddref->ja_mkdir = mkdir2;
8381         } else if (inodedep == NULL ||
8382             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8383                 dap->da_state &= ~MKDIR_PARENT;
8384                 WORKITEM_FREE(mkdir2, D_MKDIR);
8385                 mkdir2 = NULL;
8386         } else {
8387                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8388                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8389         }
8390         *mkdirp = mkdir2;
8391
8392         return (mkdir1);
8393 }
8394
8395 /*
8396  * Directory entry addition dependencies.
8397  *
8398  * When adding a new directory entry, the inode (with its incremented link
8399  * count) must be written to disk before the directory entry's pointer to it.
8400  * Also, if the inode is newly allocated, the corresponding freemap must be
8401  * updated (on disk) before the directory entry's pointer. These requirements
8402  * are met via undo/redo on the directory entry's pointer, which consists
8403  * simply of the inode number.
8404  *
8405  * As directory entries are added and deleted, the free space within a
8406  * directory block can become fragmented.  The ufs filesystem will compact
8407  * a fragmented directory block to make space for a new entry. When this
8408  * occurs, the offsets of previously added entries change. Any "diradd"
8409  * dependency structures corresponding to these entries must be updated with
8410  * the new offsets.
8411  */
8412
8413 /*
8414  * This routine is called after the in-memory inode's link
8415  * count has been incremented, but before the directory entry's
8416  * pointer to the inode has been set.
8417  */
8418 int
8419 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8420         struct buf *bp;         /* buffer containing directory block */
8421         struct inode *dp;       /* inode for directory */
8422         off_t diroffset;        /* offset of new entry in directory */
8423         ino_t newinum;          /* inode referenced by new directory entry */
8424         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
8425         int isnewblk;           /* entry is in a newly allocated block */
8426 {
8427         int offset;             /* offset of new entry within directory block */
8428         ufs_lbn_t lbn;          /* block in directory containing new entry */
8429         struct fs *fs;
8430         struct diradd *dap;
8431         struct newblk *newblk;
8432         struct pagedep *pagedep;
8433         struct inodedep *inodedep;
8434         struct newdirblk *newdirblk;
8435         struct mkdir *mkdir1, *mkdir2;
8436         struct jaddref *jaddref;
8437         struct ufsmount *ump;
8438         struct mount *mp;
8439         int isindir;
8440
8441         ump = dp->i_ump;
8442         mp = UFSTOVFS(ump);
8443         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8444             ("softdep_setup_directory_add called on non-softdep filesystem"));
8445         /*
8446          * Whiteouts have no dependencies.
8447          */
8448         if (newinum == WINO) {
8449                 if (newdirbp != NULL)
8450                         bdwrite(newdirbp);
8451                 return (0);
8452         }
8453         jaddref = NULL;
8454         mkdir1 = mkdir2 = NULL;
8455         fs = dp->i_fs;
8456         lbn = lblkno(fs, diroffset);
8457         offset = blkoff(fs, diroffset);
8458         dap = malloc(sizeof(struct diradd), M_DIRADD,
8459                 M_SOFTDEP_FLAGS|M_ZERO);
8460         workitem_alloc(&dap->da_list, D_DIRADD, mp);
8461         dap->da_offset = offset;
8462         dap->da_newinum = newinum;
8463         dap->da_state = ATTACHED;
8464         LIST_INIT(&dap->da_jwork);
8465         isindir = bp->b_lblkno >= NDADDR;
8466         newdirblk = NULL;
8467         if (isnewblk &&
8468             (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8469                 newdirblk = malloc(sizeof(struct newdirblk),
8470                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8471                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8472                 LIST_INIT(&newdirblk->db_mkdir);
8473         }
8474         /*
8475          * If we're creating a new directory setup the dependencies and set
8476          * the dap state to wait for them.  Otherwise it's COMPLETE and
8477          * we can move on.
8478          */
8479         if (newdirbp == NULL) {
8480                 dap->da_state |= DEPCOMPLETE;
8481                 ACQUIRE_LOCK(ump);
8482         } else {
8483                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8484                 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8485                     &mkdir2);
8486         }
8487         /*
8488          * Link into parent directory pagedep to await its being written.
8489          */
8490         pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8491 #ifdef DEBUG
8492         if (diradd_lookup(pagedep, offset) != NULL)
8493                 panic("softdep_setup_directory_add: %p already at off %d\n",
8494                     diradd_lookup(pagedep, offset), offset);
8495 #endif
8496         dap->da_pagedep = pagedep;
8497         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8498             da_pdlist);
8499         inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
8500         /*
8501          * If we're journaling, link the diradd into the jaddref so it
8502          * may be completed after the journal entry is written.  Otherwise,
8503          * link the diradd into its inodedep.  If the inode is not yet
8504          * written place it on the bufwait list, otherwise do the post-inode
8505          * write processing to put it on the id_pendinghd list.
8506          */
8507         if (MOUNTEDSUJ(mp)) {
8508                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8509                     inoreflst);
8510                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8511                     ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8512                 jaddref->ja_diroff = diroffset;
8513                 jaddref->ja_diradd = dap;
8514                 add_to_journal(&jaddref->ja_list);
8515         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8516                 diradd_inode_written(dap, inodedep);
8517         else
8518                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8519         /*
8520          * Add the journal entries for . and .. links now that the primary
8521          * link is written.
8522          */
8523         if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8524                 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8525                     inoreflst, if_deps);
8526                 KASSERT(jaddref != NULL &&
8527                     jaddref->ja_ino == jaddref->ja_parent &&
8528                     (jaddref->ja_state & MKDIR_BODY),
8529                     ("softdep_setup_directory_add: bad dot jaddref %p",
8530                     jaddref));
8531                 mkdir1->md_jaddref = jaddref;
8532                 jaddref->ja_mkdir = mkdir1;
8533                 /*
8534                  * It is important that the dotdot journal entry
8535                  * is added prior to the dot entry since dot writes
8536                  * both the dot and dotdot links.  These both must
8537                  * be added after the primary link for the journal
8538                  * to remain consistent.
8539                  */
8540                 add_to_journal(&mkdir2->md_jaddref->ja_list);
8541                 add_to_journal(&jaddref->ja_list);
8542         }
8543         /*
8544          * If we are adding a new directory remember this diradd so that if
8545          * we rename it we can keep the dot and dotdot dependencies.  If
8546          * we are adding a new name for an inode that has a mkdiradd we
8547          * must be in rename and we have to move the dot and dotdot
8548          * dependencies to this new name.  The old name is being orphaned
8549          * soon.
8550          */
8551         if (mkdir1 != NULL) {
8552                 if (inodedep->id_mkdiradd != NULL)
8553                         panic("softdep_setup_directory_add: Existing mkdir");
8554                 inodedep->id_mkdiradd = dap;
8555         } else if (inodedep->id_mkdiradd)
8556                 merge_diradd(inodedep, dap);
8557         if (newdirblk != NULL) {
8558                 /*
8559                  * There is nothing to do if we are already tracking
8560                  * this block.
8561                  */
8562                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
8563                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8564                         FREE_LOCK(ump);
8565                         return (0);
8566                 }
8567                 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8568                     == 0)
8569                         panic("softdep_setup_directory_add: lost entry");
8570                 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8571                 pagedep->pd_state |= NEWBLOCK;
8572                 pagedep->pd_newdirblk = newdirblk;
8573                 newdirblk->db_pagedep = pagedep;
8574                 FREE_LOCK(ump);
8575                 /*
8576                  * If we extended into an indirect signal direnter to sync.
8577                  */
8578                 if (isindir)
8579                         return (1);
8580                 return (0);
8581         }
8582         FREE_LOCK(ump);
8583         return (0);
8584 }
8585
8586 /*
8587  * This procedure is called to change the offset of a directory
8588  * entry when compacting a directory block which must be owned
8589  * exclusively by the caller. Note that the actual entry movement
8590  * must be done in this procedure to ensure that no I/O completions
8591  * occur while the move is in progress.
      *
      * Any diradd recorded at the old offset is switched to the new offset
      * (and moved to another hash bucket when required), a jmvref is
      * journaled when the mount uses SUJ, and the entry bytes are copied
      * with bcopy() before the lock on dp->i_ump is released.
8592  */
8593 void
8594 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8595         struct buf *bp;         /* Buffer holding directory block. */
8596         struct inode *dp;       /* inode for directory */
8597         caddr_t base;           /* address of dp->i_offset */
8598         caddr_t oldloc;         /* address of old directory location */
8599         caddr_t newloc;         /* address of new directory location */
8600         int entrysize;          /* size of directory entry */
8601 {
8602         int offset, oldoffset, newoffset;
8603         struct pagedep *pagedep;
8604         struct jmvref *jmvref;
8605         struct diradd *dap;
8606         struct direct *de;
8607         struct mount *mp;
8608         ufs_lbn_t lbn;
8609         int flags;
8610
8611         mp = UFSTOVFS(dp->i_ump);
8612         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8613             ("softdep_change_directoryentry_offset called on "
8614              "non-softdep filesystem"));
8615         de = (struct direct *)oldloc;
8616         jmvref = NULL;
8617         flags = 0;
8618         /*
8619          * Moves are always journaled as it would be too complex to
8620          * determine if any affected adds or removes are present in the
8621          * journal.
8622          */
8623         if (MOUNTEDSUJ(mp)) {
                     /*
                      * The jmvref is created before the lock is taken and
                      * records the old and new offsets of the entry within
                      * the directory.  DEPALLOC is only requested on this
                      * journaled path.
                      */
8624                 flags = DEPALLOC;
8625                 jmvref = newjmvref(dp, de->d_ino,
8626                     dp->i_offset + (oldloc - base),
8627                     dp->i_offset + (newloc - base));
8628         }
             /*
              * lbn selects the pagedep; oldoffset/newoffset are
              * blkoff(dp->i_offset) plus the distance of each location
              * from base.
              */
8629         lbn = lblkno(dp->i_fs, dp->i_offset);
8630         offset = blkoff(dp->i_fs, dp->i_offset);
8631         oldoffset = offset + (oldloc - base);
8632         newoffset = offset + (newloc - base);
8633         ACQUIRE_LOCK(dp->i_ump);
             /* A zero return skips the diradd fix-up below. */
8634         if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8635                 goto done;
8636         dap = diradd_lookup(pagedep, oldoffset);
8637         if (dap) {
                     /*
                      * Record the new offset, then reuse newoffset and
                      * oldoffset as hash bucket indices.  Only a diradd that
                      * is not ALLCOMPLETE is moved between buckets;
                      * complete_diradd() in this file places ALLCOMPLETE
                      * entries on pd_pendinghd, which is not indexed by
                      * offset.
                      */
8638                 dap->da_offset = newoffset;
8639                 newoffset = DIRADDHASH(newoffset);
8640                 oldoffset = DIRADDHASH(oldoffset);
8641                 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8642                     newoffset != oldoffset) {
8643                         LIST_REMOVE(dap, da_pdlist);
8644                         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8645                             dap, da_pdlist);
8646                 }
8647         }
8648 done:
             /*
              * NOTE(review): pagedep is dereferenced here even when
              * pagedep_lookup() returned 0.  jmvref is only non-NULL when
              * DEPALLOC was passed, so presumably a pagedep has always been
              * allocated in that case -- confirm against pagedep_lookup().
              */
8649         if (jmvref) {
8650                 jmvref->jm_pagedep = pagedep;
8651                 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8652                 add_to_journal(&jmvref->jm_list);
8653         }
             /* The entry itself is moved while the lock is still held. */
8654         bcopy(oldloc, newloc, entrysize);
8655         FREE_LOCK(dp->i_ump);
8656 }
8657
8658 /*
8659  * Move the mkdir dependencies and journal work from one diradd to another
8660  * when renaming a directory.  The new name must depend on the mkdir deps
8661  * completing as the old name did.  Directories can only have one valid link
8662  * at a time so one must be canonical.
8663  */
8664 static void
8665 merge_diradd(inodedep, newdap)
8666         struct inodedep *inodedep;
8667         struct diradd *newdap;
8668 {
8669         struct diradd *olddap;
8670         struct mkdir *mkdir, *nextmd;
8671         struct ufsmount *ump;
8672         short state;
8673
8674         olddap = inodedep->id_mkdiradd;
8675         inodedep->id_mkdiradd = newdap;
8676         if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8677                 newdap->da_state &= ~DEPCOMPLETE;
8678                 ump = VFSTOUFS(inodedep->id_list.wk_mp);
8679                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8680                      mkdir = nextmd) {
8681                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
8682                         if (mkdir->md_diradd != olddap)
8683                                 continue;
8684                         mkdir->md_diradd = newdap;
8685                         state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8686                         newdap->da_state |= state;
8687                         olddap->da_state &= ~state;
8688                         if ((olddap->da_state &
8689                             (MKDIR_PARENT | MKDIR_BODY)) == 0)
8690                                 break;
8691                 }
8692                 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8693                         panic("merge_diradd: unfound ref");
8694         }
8695         /*
8696          * Any mkdir related journal items are not safe to be freed until
8697          * the new name is stable.
8698          */
8699         jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8700         olddap->da_state |= DEPCOMPLETE;
8701         complete_diradd(olddap);
8702 }
8703
8704 /*
8705  * Move the diradd to the pending list when all diradd dependencies are
8706  * complete.
8707  */
8708 static void
8709 complete_diradd(dap)
8710         struct diradd *dap;
8711 {
8712         struct pagedep *pagedep;
8713
8714         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8715                 if (dap->da_state & DIRCHG)
8716                         pagedep = dap->da_previous->dm_pagedep;
8717                 else
8718                         pagedep = dap->da_pagedep;
8719                 LIST_REMOVE(dap, da_pdlist);
8720                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8721         }
8722 }
8723
8724 /*
8725  * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8726  * add entries and conditionally journal the remove.
      *
      * jremref, dotremref and dotdotremref are the remove references prepared
      * by the caller for the name, "." and ".." respectively.  Each one is
      * either freed here (when the matching add is cancelled without needing
      * a journaled remove) or handed to journal_jremref().  The diradd is
      * always freed before returning.
8727  */
8728 static void
8729 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8730         struct diradd *dap;
8731         struct dirrem *dirrem;
8732         struct jremref *jremref;
8733         struct jremref *dotremref;
8734         struct jremref *dotdotremref;
8735 {
8736         struct inodedep *inodedep;
8737         struct jaddref *jaddref;
8738         struct inoref *inoref;
8739         struct ufsmount *ump;
8740         struct mkdir *mkdir;
8741
8742         /*
8743          * If no remove references were allocated we're on a non-journaled
8744          * filesystem and can skip the cancel step.
8745          */
8746         if (jremref == NULL) {
8747                 free_diradd(dap, NULL);
8748                 return;
8749         }
8750         /*
8751          * Cancel the primary name and free it if it does not require
8752          * journaling.
8753          */
8754         if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8755             0, &inodedep) != 0) {
8756                 /* Abort the addref that references this diradd.  */
8757                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8758                         if (inoref->if_list.wk_type != D_JADDREF)
8759                                 continue;
8760                         jaddref = (struct jaddref *)inoref;
8761                         if (jaddref->ja_diradd != dap)
8762                                 continue;
                             /*
                              * A zero return from cancel_jaddref() is treated
                              * as "the remove need not be journaled", so the
                              * matching jremref is discarded.
                              */
8763                         if (cancel_jaddref(jaddref, inodedep,
8764                             &dirrem->dm_jwork) == 0) {
8765                                 free_jremref(jremref);
8766                                 jremref = NULL;
8767                         }
                             /* Only the first matching jaddref is handled. */
8768                         break;
8769                 }
8770         }
8771         /*
8772          * Cancel subordinate names and free them if they do not require
8773          * journaling.
              *
              * NOTE(review): inodedep is used from here on even when the
              * lookup above returned 0; presumably inodedep_lookup() stores
              * NULL in that case (journal_jremref() in this file redoes the
              * lookup when handed NULL) -- confirm.
8774          */
8775         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8776                 ump = VFSTOUFS(dap->da_list.wk_mp);
8777                 LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
8778                         if (mkdir->md_diradd != dap)
8779                                 continue;
8780                         if ((jaddref = mkdir->md_jaddref) == NULL)
8781                                 continue;
                             /* Detach the jaddref from the mkdir first. */
8782                         mkdir->md_jaddref = NULL;
                             /*
                              * MKDIR_PARENT items pair with dotdotremref and
                              * pass no inodedep; all others pair with
                              * dotremref and use the inodedep found above.
                              */
8783                         if (mkdir->md_state & MKDIR_PARENT) {
8784                                 if (cancel_jaddref(jaddref, NULL,
8785                                     &dirrem->dm_jwork) == 0) {
8786                                         free_jremref(dotdotremref);
8787                                         dotdotremref = NULL;
8788                                 }
8789                         } else {
8790                                 if (cancel_jaddref(jaddref, inodedep,
8791                                     &dirrem->dm_jwork) == 0) {
8792                                         free_jremref(dotremref);
8793                                         dotremref = NULL;
8794                                 }
8795                         }
8796                 }
8797         }
8798
             /*
              * Journal whichever remove references survived, hand the
              * diradd's pending journal work to the dirrem, then free the
              * diradd.
              */
8799         if (jremref)
8800                 journal_jremref(dirrem, jremref, inodedep);
8801         if (dotremref)
8802                 journal_jremref(dirrem, dotremref, inodedep);
8803         if (dotdotremref)
8804                 journal_jremref(dirrem, dotdotremref, NULL);
8805         jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8806         free_diradd(dap, &dirrem->dm_jwork);
8807 }
8808
8809 /*
8810  * Free a diradd dependency structure.  The diradd is unlinked from its
8811  * pagedep list and from any worklist, mkdir items that still point at
      * it are freed, its pending journal work is passed to handle_jwork(),
      * and the diradd itself is released.
      *
      * LOCK_OWNED(ump) is invoked on entry, so the caller is expected to
      * hold the softdep lock of the mount.  NOTE(review): this comment used
      * to require "splbio interrupts blocked", which no longer matches the
      * code -- confirm the intended locking contract.
      *
      * The wkhd argument is not referenced anywhere in this function body.
8812  */
8813 static void
8814 free_diradd(dap, wkhd)
8815         struct diradd *dap;
8816         struct workhead *wkhd;
8817 {
8818         struct dirrem *dirrem;
8819         struct pagedep *pagedep;
8820         struct inodedep *inodedep;
8821         struct mkdir *mkdir, *nextmd;
8822         struct ufsmount *ump;
8823
8824         ump = VFSTOUFS(dap->da_list.wk_mp);
8825         LOCK_OWNED(ump);
8826         LIST_REMOVE(dap, da_pdlist);
8827         if (dap->da_state & ONWORKLIST)
8828                 WORKLIST_REMOVE(&dap->da_list);
8829         if ((dap->da_state & DIRCHG) == 0) {
8830                 pagedep = dap->da_pagedep;
8831         } else {
                     /*
                      * A changed entry carries the dirrem of the previous
                      * name.  Mark that dirrem COMPLETE and queue it as work
                      * unless it still has jremrefs outstanding.
                      */
8832                 dirrem = dap->da_previous;
8833                 pagedep = dirrem->dm_pagedep;
8834                 dirrem->dm_dirinum = pagedep->pd_ino;
8835                 dirrem->dm_state |= COMPLETE;
8836                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8837                         add_to_worklist(&dirrem->dm_list, 0);
8838         }
             /* Drop the inodedep's back pointer if it names this diradd. */
8839         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8840             0, &inodedep) != 0)
8841                 if (inodedep->id_mkdiradd == dap)
8842                         inodedep->id_mkdiradd = NULL;
8843         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
                     /*
                      * Free every mkdir that references this diradd.  nextmd
                      * is fetched before the current item is freed, and the
                      * walk stops once both mkdir bits have been cleared.
                      */
8844                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8845                      mkdir = nextmd) {
8846                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
8847                         if (mkdir->md_diradd != dap)
8848                                 continue;
8849                         dap->da_state &=
8850                             ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8851                         LIST_REMOVE(mkdir, md_mkdirs);
8852                         if (mkdir->md_state & ONWORKLIST)
8853                                 WORKLIST_REMOVE(&mkdir->md_list);
                             /* Any jaddref must already have been detached. */
8854                         if (mkdir->md_jaddref != NULL)
8855                                 panic("free_diradd: Unexpected jaddref");
8856                         WORKITEM_FREE(mkdir, D_MKDIR);
8857                         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8858                                 break;
8859                 }
8860                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8861                         panic("free_diradd: unfound ref");
8862         }
             /*
              * NOTE(review): this test relies on inodedep_lookup() having
              * stored NULL in inodedep when it returned 0 -- confirm.
              */
8863         if (inodedep)
8864                 free_inodedep(inodedep);
8865         /*
8866          * Free any journal segments waiting for the directory write.
8867          */
8868         handle_jwork(&dap->da_jwork);
8869         WORKITEM_FREE(dap, D_DIRADD);
8870 }
8871
8872 /*
8873  * Directory entry removal dependencies.
8874  *
8875  * When removing a directory entry, the entry's inode pointer must be
8876  * zero'ed on disk before the corresponding inode's link count is decremented
8877  * (possibly freeing the inode for re-use). This dependency is handled by
8878  * updating the directory entry but delaying the inode count reduction until
8879  * after the directory block has been written to disk. After this point, the
8880  * inode count can be decremented whenever it is convenient.
8881  */
8882
8883 /*
8884  * This routine should be called immediately after removing
8885  * a directory entry.  The inode's link count should not be
8886  * decremented by the calling procedure -- the soft updates
8887  * code will do this task when it is safe.
8888  */
8889 void
8890 softdep_setup_remove(bp, dp, ip, isrmdir)
8891         struct buf *bp;         /* buffer containing directory block */
8892         struct inode *dp;       /* inode for the directory being modified */
8893         struct inode *ip;       /* inode for directory entry being removed */
8894         int isrmdir;            /* indicates if doing RMDIR */
8895 {
8896         struct dirrem *dirrem, *prevdirrem;
8897         struct inodedep *inodedep;
8898         int direct;
8899
8900         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
8901             ("softdep_setup_remove called on non-softdep filesystem"));
8902         /*
8903          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8904          * newdirrem() to setup the full directory remove which requires
8905          * isrmdir > 1.
8906          */
8907         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8908         /*
8909          * Add the dirrem to the inodedep's pending remove list for quick
8910          * discovery later.
8911          */
8912         if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8913             &inodedep) == 0)
8914                 panic("softdep_setup_remove: Lost inodedep.");
8915         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8916         dirrem->dm_state |= ONDEPLIST;
8917         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8918
8919         /*
8920          * If the COMPLETE flag is clear, then there were no active
8921          * entries and we want to roll back to a zeroed entry until
8922          * the new inode is committed to disk. If the COMPLETE flag is
8923          * set then we have deleted an entry that never made it to
8924          * disk. If the entry we deleted resulted from a name change,
8925          * then the old name still resides on disk. We cannot delete
8926          * its inode (returned to us in prevdirrem) until the zeroed
8927          * directory entry gets to disk. The new inode has never been
8928          * referenced on the disk, so can be deleted immediately.
8929          */
8930         if ((dirrem->dm_state & COMPLETE) == 0) {
8931                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8932                     dm_next);
8933                 FREE_LOCK(ip->i_ump);
8934         } else {
8935                 if (prevdirrem != NULL)
8936                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8937                             prevdirrem, dm_next);
8938                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8939                 direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8940                 FREE_LOCK(ip->i_ump);
8941                 if (direct)
8942                         handle_workitem_remove(dirrem, 0);
8943         }
8944 }
8945
8946 /*
8947  * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
8948  * pd_pendinghd list of a pagedep.
8949  */
8950 static struct diradd *
8951 diradd_lookup(pagedep, offset)
8952         struct pagedep *pagedep;
8953         int offset;
8954 {
8955         struct diradd *dap;
8956
8957         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8958                 if (dap->da_offset == offset)
8959                         return (dap);
8960         LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8961                 if (dap->da_offset == offset)
8962                         return (dap);
8963         return (NULL);
8964 }
8965
8966 /*
8967  * Search for a .. diradd dependency in a directory that is being removed.
8968  * If the directory was renamed to a new parent we have a diradd rather
8969  * than a mkdir for the .. entry.  We need to cancel it now before
8970  * it is found in truncate().
8971  */
8972 static struct jremref *
8973 cancel_diradd_dotdot(ip, dirrem, jremref)
8974         struct inode *ip;
8975         struct dirrem *dirrem;
8976         struct jremref *jremref;
8977 {
8978         struct pagedep *pagedep;
8979         struct diradd *dap;
8980         struct worklist *wk;
8981
8982         if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8983             &pagedep) == 0)
8984                 return (jremref);
8985         dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8986         if (dap == NULL)
8987                 return (jremref);
8988         cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8989         /*
8990          * Mark any journal work as belonging to the parent so it is freed
8991          * with the .. reference.
8992          */
8993         LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8994                 wk->wk_state |= MKDIR_PARENT;
8995         return (NULL);
8996 }
8997
8998 /*
8999  * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9000  * replace it with a dirrem/diradd pair as a result of re-parenting a
9001  * directory.  This ensures that we don't simultaneously have a mkdir and
9002  * a diradd for the same .. entry.
      *
      * Returns jremref unchanged when there is no inodedep for ip, no
      * id_mkdiradd, or no MKDIR_PARENT bit on it, and also when the mkdir's
      * jaddref could be cancelled without journaling a remove.  Returns NULL
      * once jremref has been handed to journal_jremref().
9003  */
9004 static struct jremref *
9005 cancel_mkdir_dotdot(ip, dirrem, jremref)
9006         struct inode *ip;
9007         struct dirrem *dirrem;
9008         struct jremref *jremref;
9009 {
9010         struct inodedep *inodedep;
9011         struct jaddref *jaddref;
9012         struct ufsmount *ump;
9013         struct mkdir *mkdir;
9014         struct diradd *dap;
9015
9016         if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9017             &inodedep) == 0)
9018                 return (jremref);
9019         dap = inodedep->id_mkdiradd;
9020         if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9021                 return (jremref);
             /*
              * Find the MKDIR_PARENT mkdir attached to this diradd.  The
              * state bit on the diradd says one must exist, so failing to
              * find it is fatal.
              */
9022         ump = VFSTOUFS(inodedep->id_list.wk_mp);
9023         for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9024             mkdir = LIST_NEXT(mkdir, md_mkdirs))
9025                 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9026                         break;
9027         if (mkdir == NULL)
9028                 panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9029         if ((jaddref = mkdir->md_jaddref) != NULL) {
9030                 mkdir->md_jaddref = NULL;
9031                 jaddref->ja_state &= ~MKDIR_PARENT;
                     /*
                      * inodedep is reused here: from this point on it is the
                      * inodedep of jaddref->ja_ino rather than that of ip.
                      */
9032                 if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
9033                     &inodedep) == 0)
9034                         panic("cancel_mkdir_dotdot: Lost parent inodedep");
                     /*
                      * A non-zero return from cancel_jaddref() is treated as
                      * "the remove must be journaled".
                      */
9035                 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9036                         journal_jremref(dirrem, jremref, inodedep);
9037                         jremref = NULL;
9038                 }
9039         }
             /* Retire the mkdir: unlink it and force it to completion. */
9040         if (mkdir->md_state & ONWORKLIST)
9041                 WORKLIST_REMOVE(&mkdir->md_list);
9042         mkdir->md_state |= ALLCOMPLETE;
9043         complete_mkdir(mkdir);
9044         return (jremref);
9045 }
9046
9047 static void
9048 journal_jremref(dirrem, jremref, inodedep)
9049         struct dirrem *dirrem;
9050         struct jremref *jremref;
9051         struct inodedep *inodedep;
9052 {
9053
9054         if (inodedep == NULL)
9055                 if (inodedep_lookup(jremref->jr_list.wk_mp,
9056                     jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9057                         panic("journal_jremref: Lost inodedep");
9058         LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9059         TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9060         add_to_journal(&jremref->jr_list);
9061 }
9062
9063 static void
9064 dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
9065         struct dirrem *dirrem;
9066         struct jremref *jremref;
9067         struct jremref *dotremref;
9068         struct jremref *dotdotremref;
9069 {
9070         struct inodedep *inodedep;
9071
9072
9073         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9074             &inodedep) == 0)
9075                 panic("dirrem_journal: Lost inodedep");
9076         journal_jremref(dirrem, jremref, inodedep);
9077         if (dotremref)
9078                 journal_jremref(dirrem, dotremref, inodedep);
9079         if (dotdotremref)
9080                 journal_jremref(dirrem, dotdotremref, NULL);
9081 }
9082
9083 /*
9084  * Allocate a new dirrem if appropriate and return it along with
9085  * its associated pagedep. Called without a lock, returns with lock.
      *
      * The pagedep is returned through dirrem->dm_pagedep.  *prevdirremp is
      * set to NULL, or to the dirrem of the previous name when the removed
      * entry was a not-yet-written name change (DIRCHG).  When a diradd is
      * found at the same offset it is cancelled and the returned dirrem is
      * marked COMPLETE; otherwise any jremrefs are journaled through
      * dirrem_journal().
9086  */
9087 static struct dirrem *
9088 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
9089         struct buf *bp;         /* buffer containing directory block */
9090         struct inode *dp;       /* inode for the directory being modified */
9091         struct inode *ip;       /* inode for directory entry being removed */
9092         int isrmdir;            /* indicates if doing RMDIR */
9093         struct dirrem **prevdirremp; /* previously referenced inode, if any */
9094 {
9095         int offset;
9096         ufs_lbn_t lbn;
9097         struct diradd *dap;
9098         struct dirrem *dirrem;
9099         struct pagedep *pagedep;
9100         struct jremref *jremref;
9101         struct jremref *dotremref;
9102         struct jremref *dotdotremref;
9103         struct vnode *dvp;
9104
9105         /*
9106          * Whiteouts have no deletion dependencies.
              * A NULL ip is therefore never expected here and is fatal.
9107          */
9108         if (ip == NULL)
9109                 panic("newdirrem: whiteout");
9110         dvp = ITOV(dp);
9111         /*
9112          * If the system is over its limit and our filesystem is
9113          * responsible for more than our share of that usage and
9114          * we are not a snapshot, request some inodedep cleanup.
9115          * Limiting the number of dirrem structures will also limit
9116          * the number of freefile and freeblks structures.
              *
              * NOTE(review): the lock taken here is released explicitly
              * only on the else path; presumably schedule_cleanup() drops
              * it itself, since the lock is acquired again further down --
              * confirm.
9117          */
9118         ACQUIRE_LOCK(ip->i_ump);
9119         if (!IS_SNAPSHOT(ip) && softdep_excess_items(ip->i_ump, D_DIRREM))
9120                 schedule_cleanup(ITOV(dp)->v_mount);
9121         else
9122                 FREE_LOCK(ip->i_ump);
             /* The dirrem is allocated zero-filled (M_ZERO). */
9123         dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9124             M_ZERO);
9125         workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9126         LIST_INIT(&dirrem->dm_jremrefhd);
9127         LIST_INIT(&dirrem->dm_jwork);
9128         dirrem->dm_state = isrmdir ? RMDIR : 0;
9129         dirrem->dm_oldinum = ip->i_number;
9130         *prevdirremp = NULL;
9131         /*
9132          * Allocate remove reference structures to track journal write
9133          * dependencies.  We will always have one for the link and
9134          * when doing directories we will always have one more for dot.
9135          * When renaming a directory we skip the dotdot link change so
9136          * this is not needed.
9137          */
9138         jremref = dotremref = dotdotremref = NULL;
9139         if (DOINGSUJ(dvp)) {
9140                 if (isrmdir) {
9141                         jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9142                             ip->i_effnlink + 2);
9143                         dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9144                             ip->i_effnlink + 1);
9145                         dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9146                             dp->i_effnlink + 1);
9147                         dotdotremref->jr_state |= MKDIR_PARENT;
9148                 } else
9149                         jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9150                             ip->i_effnlink + 1);
9151         }
             /*
              * From here on the lock is held; it is still held when this
              * function returns.  DEPALLOC is passed to the pagedep lookup.
              */
9152         ACQUIRE_LOCK(ip->i_ump);
9153         lbn = lblkno(dp->i_fs, dp->i_offset);
9154         offset = blkoff(dp->i_fs, dp->i_offset);
9155         pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
9156             &pagedep);
9157         dirrem->dm_pagedep = pagedep;
9158         dirrem->dm_offset = offset;
9159         /*
9160          * If we're renaming a .. link to a new directory, cancel any
9161          * existing MKDIR_PARENT mkdir.  If it has already been canceled
9162          * the jremref is preserved for any potential diradd in this
9163          * location.  This can not coincide with a rmdir.
9164          */
9165         if (dp->i_offset == DOTDOT_OFFSET) {
9166                 if (isrmdir)
9167                         panic("newdirrem: .. directory change during remove?");
9168                 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9169         }
9170         /*
9171          * If we're removing a directory search for the .. dependency now and
9172          * cancel it.  Any pending journal work will be added to the dirrem
9173          * to be completed when the workitem remove completes.
9174          */
9175         if (isrmdir)
9176                 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9177         /*
9178          * Check for a diradd dependency for the same directory entry.
9179          * If present, then both dependencies become obsolete and can
9180          * be de-allocated.
9181          */
9182         dap = diradd_lookup(pagedep, offset);
9183         if (dap == NULL) {
9184                 /*
9185                  * Link the jremref structures into the dirrem so they are
9186                  * written prior to the pagedep.
9187                  */
9188                 if (jremref)
9189                         dirrem_journal(dirrem, jremref, dotremref,
9190                             dotdotremref);
9191                 return (dirrem);
9192         }
9193         /*
9194          * Must be ATTACHED at this point.
              * The diradd must also name the inode that is being removed.
9195          */
9196         if ((dap->da_state & ATTACHED) == 0)
9197                 panic("newdirrem: not ATTACHED");
9198         if (dap->da_newinum != ip->i_number)
9199                 panic("newdirrem: inum %ju should be %ju",
9200                     (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9201         /*
9202          * If we are deleting a changed name that never made it to disk,
9203          * then return the dirrem describing the previous inode (which
9204          * represents the inode currently referenced from this entry on disk).
              * DIRCHG is cleared and da_pagedep is set so that the diradd is
              * subsequently handled like a plain add.
9205          */
9206         if ((dap->da_state & DIRCHG) != 0) {
9207                 *prevdirremp = dap->da_previous;
9208                 dap->da_state &= ~DIRCHG;
9209                 dap->da_pagedep = pagedep;
9210         }
9211         /*
9212          * We are deleting an entry that never made it to disk.
9213          * Mark it COMPLETE so we can delete its inode immediately.
9214          */
9215         dirrem->dm_state |= COMPLETE;
9216         cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9217 #ifdef SUJ_DEBUG
             /*
              * Debug check: for a non-rmdir remove, no journal work left on
              * the dirrem may carry a mkdir flag.
              */
9218         if (isrmdir == 0) {
9219                 struct worklist *wk;
9220
9221                 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9222                         if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9223                                 panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9224         }
9225 #endif
9226
9227         return (dirrem);
9228 }
9229
9230 /*
9231  * Directory entry change dependencies.
9232  *
9233  * Changing an existing directory entry requires that an add operation
9234  * be completed first followed by a deletion. The semantics for the addition
9235  * are identical to the description of adding a new entry above except
9236  * that the rollback is to the old inode number rather than zero. Once
9237  * the addition dependency is completed, the removal is done as described
9238  * in the removal routine above.
9239  */
9240
9241 /*
9242  * This routine should be called immediately after changing
9243  * a directory entry.  The inode's link count should not be
9244  * decremented by the calling procedure -- the soft updates
9245  * code will perform this task when it is safe.
9246  */
9247 void
9248 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9249         struct buf *bp;         /* buffer containing directory block */
9250         struct inode *dp;       /* inode for the directory being modified */
9251         struct inode *ip;       /* inode for directory entry being removed */
9252         ino_t newinum;          /* new inode number for changed entry */
9253         int isrmdir;            /* indicates if doing RMDIR */
9254 {
9255         int offset;
9256         struct diradd *dap = NULL;
9257         struct dirrem *dirrem, *prevdirrem;
9258         struct pagedep *pagedep;
9259         struct inodedep *inodedep;
9260         struct jaddref *jaddref;
9261         struct mount *mp;
9262
9263         offset = blkoff(dp->i_fs, dp->i_offset);
9264         mp = UFSTOVFS(dp->i_ump);
9265         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9266            ("softdep_setup_directory_change called on non-softdep filesystem"));
9267
9268         /*
9269          * Whiteouts do not need diradd dependencies.
9270          */
9271         if (newinum != WINO) {
9272                 dap = malloc(sizeof(struct diradd),
9273                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9274                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
9275                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9276                 dap->da_offset = offset;
9277                 dap->da_newinum = newinum;
9278                 LIST_INIT(&dap->da_jwork);
9279         }
9280
9281         /*
9282          * Allocate a new dirrem and ACQUIRE_LOCK.
9283          */
9284         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9285         pagedep = dirrem->dm_pagedep;
9286         /*
9287          * The possible values for isrmdir:
9288          *      0 - non-directory file rename
9289          *      1 - directory rename within same directory
9290          *   inum - directory rename to new directory of given inode number
9291          * When renaming to a new directory, we are both deleting and
9292          * creating a new directory entry, so the link count on the new
9293          * directory should not change. Thus we do not need the followup
9294          * dirrem which is usually done in handle_workitem_remove. We set
9295          * the DIRCHG flag to tell handle_workitem_remove to skip the
9296          * followup dirrem.
9297          */
9298         if (isrmdir > 1)
9299                 dirrem->dm_state |= DIRCHG;
9300
9301         /*
9302          * Whiteouts have no additional dependencies,
9303          * so just put the dirrem on the correct list.
9304          */
9305         if (newinum == WINO) {
9306                 if ((dirrem->dm_state & COMPLETE) == 0) {
9307                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9308                             dm_next);
9309                 } else {
9310                         dirrem->dm_dirinum = pagedep->pd_ino;
9311                         if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9312                                 add_to_worklist(&dirrem->dm_list, 0);
9313                 }
9314                 FREE_LOCK(dp->i_ump);
9315                 return;
9316         }
9317         /*
9318          * Add the dirrem to the inodedep's pending remove list for quick
9319          * discovery later.  A valid nlinkdelta ensures that this lookup
9320          * will not fail.
9321          */
9322         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9323                 panic("softdep_setup_directory_change: Lost inodedep.");
9324         dirrem->dm_state |= ONDEPLIST;
9325         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9326
9327         /*
9328          * If the COMPLETE flag is clear, then there were no active
9329          * entries and we want to roll back to the previous inode until
9330          * the new inode is committed to disk. If the COMPLETE flag is
9331          * set, then we have deleted an entry that never made it to disk.
9332          * If the entry we deleted resulted from a name change, then the old
9333          * inode reference still resides on disk. Any rollback that we do
9334          * needs to be to that old inode (returned to us in prevdirrem). If
9335          * the entry we deleted resulted from a create, then there is
9336          * no entry on the disk, so we want to roll back to zero rather
9337          * than the uncommitted inode. In either of the COMPLETE cases we
9338          * want to immediately free the unwritten and unreferenced inode.
9339          */
9340         if ((dirrem->dm_state & COMPLETE) == 0) {
9341                 dap->da_previous = dirrem;
9342         } else {
9343                 if (prevdirrem != NULL) {
9344                         dap->da_previous = prevdirrem;
9345                 } else {
9346                         dap->da_state &= ~DIRCHG;
9347                         dap->da_pagedep = pagedep;
9348                 }
9349                 dirrem->dm_dirinum = pagedep->pd_ino;
9350                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9351                         add_to_worklist(&dirrem->dm_list, 0);
9352         }
9353         /*
9354          * Lookup the jaddref for this journal entry.  We must finish
9355          * initializing it and make the diradd write dependent on it.
9356          * If we're not journaling, put it on the id_bufwait list if the
9357          * inode is not yet written. If it is written, do the post-inode
9358          * write processing to put it on the id_pendinghd list.
9359          */
9360         inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
9361         if (MOUNTEDSUJ(mp)) {
9362                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9363                     inoreflst);
9364                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9365                     ("softdep_setup_directory_change: bad jaddref %p",
9366                     jaddref));
9367                 jaddref->ja_diroff = dp->i_offset;
9368                 jaddref->ja_diradd = dap;
9369                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9370                     dap, da_pdlist);
9371                 add_to_journal(&jaddref->ja_list);
9372         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9373                 dap->da_state |= COMPLETE;
9374                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9375                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9376         } else {
9377                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9378                     dap, da_pdlist);
9379                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9380         }
9381         /*
9382          * If we're making a new name for a directory that has not been
9383          * committed when need to move the dot and dotdot references to
9384          * this new name.
9385          */
9386         if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9387                 merge_diradd(inodedep, dap);
9388         FREE_LOCK(dp->i_ump);
9389 }
9390
9391 /*
9392  * Called whenever the link count on an inode is changed.
9393  * It creates an inode dependency so that the new reference(s)
9394  * to the inode cannot be committed to disk until the updated
9395  * inode has been written.
9396  */
9397 void
9398 softdep_change_linkcnt(ip)
9399         struct inode *ip;       /* the inode with the increased link count */
9400 {
9401         struct inodedep *inodedep;
9402
9403         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
9404             ("softdep_change_linkcnt called on non-softdep filesystem"));
9405         ACQUIRE_LOCK(ip->i_ump);
9406         inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
9407             &inodedep);
9408         if (ip->i_nlink < ip->i_effnlink)
9409                 panic("softdep_change_linkcnt: bad delta");
9410         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9411         FREE_LOCK(ip->i_ump);
9412 }
9413
9414 /*
9415  * Attach a sbdep dependency to the superblock buf so that we can keep
9416  * track of the head of the linked list of referenced but unlinked inodes.
9417  */
9418 void
9419 softdep_setup_sbupdate(ump, fs, bp)
9420         struct ufsmount *ump;
9421         struct fs *fs;
9422         struct buf *bp;
9423 {
9424         struct sbdep *sbdep;
9425         struct worklist *wk;
9426
9427         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9428             ("softdep_setup_sbupdate called on non-softdep filesystem"));
9429         LIST_FOREACH(wk, &bp->b_dep, wk_list)
9430                 if (wk->wk_type == D_SBDEP)
9431                         break;
9432         if (wk != NULL)
9433                 return;
9434         sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9435         workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9436         sbdep->sb_fs = fs;
9437         sbdep->sb_ump = ump;
9438         ACQUIRE_LOCK(ump);
9439         WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9440         FREE_LOCK(ump);
9441 }
9442
9443 /*
9444  * Return the first unlinked inodedep which is ready to be the head of the
9445  * list.  The inodedep and all those after it must have valid next pointers.
9446  */
9447 static struct inodedep *
9448 first_unlinked_inodedep(ump)
9449         struct ufsmount *ump;
9450 {
9451         struct inodedep *inodedep;
9452         struct inodedep *idp;
9453
9454         LOCK_OWNED(ump);
9455         for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9456             inodedep; inodedep = idp) {
9457                 if ((inodedep->id_state & UNLINKNEXT) == 0)
9458                         return (NULL);
9459                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9460                 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9461                         break;
9462                 if ((inodedep->id_state & UNLINKPREV) == 0)
9463                         break;
9464         }
9465         return (inodedep);
9466 }
9467
9468 /*
9469  * Set the sujfree unlinked head pointer prior to writing a superblock.
9470  */
9471 static void
9472 initiate_write_sbdep(sbdep)
9473         struct sbdep *sbdep;
9474 {
9475         struct inodedep *inodedep;
9476         struct fs *bpfs;
9477         struct fs *fs;
9478
9479         bpfs = sbdep->sb_fs;
9480         fs = sbdep->sb_ump->um_fs;
9481         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9482         if (inodedep) {
9483                 fs->fs_sujfree = inodedep->id_ino;
9484                 inodedep->id_state |= UNLINKPREV;
9485         } else
9486                 fs->fs_sujfree = 0;
9487         bpfs->fs_sujfree = fs->fs_sujfree;
9488 }
9489
9490 /*
9491  * After a superblock is written determine whether it must be written again
9492  * due to a changing unlinked list head.
9493  */
9494 static int
9495 handle_written_sbdep(sbdep, bp)
9496         struct sbdep *sbdep;
9497         struct buf *bp;
9498 {
9499         struct inodedep *inodedep;
9500         struct fs *fs;
9501
9502         LOCK_OWNED(sbdep->sb_ump);
9503         fs = sbdep->sb_fs;
9504         /*
9505          * If the superblock doesn't match the in-memory list start over.
9506          */
9507         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9508         if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9509             (inodedep == NULL && fs->fs_sujfree != 0)) {
9510                 bdirty(bp);
9511                 return (1);
9512         }
9513         WORKITEM_FREE(sbdep, D_SBDEP);
9514         if (fs->fs_sujfree == 0)
9515                 return (0);
9516         /*
9517          * Now that we have a record of this inode in stable store allow it
9518          * to be written to free up pending work.  Inodes may see a lot of
9519          * write activity after they are unlinked which we must not hold up.
9520          */
9521         for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9522                 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9523                         panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9524                             inodedep, inodedep->id_state);
9525                 if (inodedep->id_state & UNLINKONLIST)
9526                         break;
9527                 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9528         }
9529
9530         return (0);
9531 }
9532
9533 /*
9534  * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9535  */
9536 static void
9537 unlinked_inodedep(mp, inodedep)
9538         struct mount *mp;
9539         struct inodedep *inodedep;
9540 {
9541         struct ufsmount *ump;
9542
9543         ump = VFSTOUFS(mp);
9544         LOCK_OWNED(ump);
9545         if (MOUNTEDSUJ(mp) == 0)
9546                 return;
9547         ump->um_fs->fs_fmod = 1;
9548         if (inodedep->id_state & UNLINKED)
9549                 panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9550         inodedep->id_state |= UNLINKED;
9551         TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9552 }
9553
9554 /*
9555  * Remove an inodedep from the unlinked inodedep list.  This may require
9556  * disk writes if the inode has made it that far.
9557  */
9558 static void
9559 clear_unlinked_inodedep(inodedep)
9560         struct inodedep *inodedep;
9561 {
9562         struct ufsmount *ump;
9563         struct inodedep *idp;
9564         struct inodedep *idn;
9565         struct fs *fs;
9566         struct buf *bp;
9567         ino_t ino;
9568         ino_t nino;
9569         ino_t pino;
9570         int error;
9571
9572         ump = VFSTOUFS(inodedep->id_list.wk_mp);
9573         fs = ump->um_fs;
9574         ino = inodedep->id_ino;
9575         error = 0;
9576         for (;;) {
9577                 LOCK_OWNED(ump);
9578                 KASSERT((inodedep->id_state & UNLINKED) != 0,
9579                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
9580                     inodedep));
9581                 /*
9582                  * If nothing has yet been written simply remove us from
9583                  * the in memory list and return.  This is the most common
9584                  * case where handle_workitem_remove() loses the final
9585                  * reference.
9586                  */
9587                 if ((inodedep->id_state & UNLINKLINKS) == 0)
9588                         break;
9589                 /*
9590                  * If we have a NEXT pointer and no PREV pointer we can simply
9591                  * clear NEXT's PREV and remove ourselves from the list.  Be
9592                  * careful not to clear PREV if the superblock points at
9593                  * next as well.
9594                  */
9595                 idn = TAILQ_NEXT(inodedep, id_unlinked);
9596                 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9597                         if (idn && fs->fs_sujfree != idn->id_ino)
9598                                 idn->id_state &= ~UNLINKPREV;
9599                         break;
9600                 }
9601                 /*
9602                  * Here we have an inodedep which is actually linked into
9603                  * the list.  We must remove it by forcing a write to the
9604                  * link before us, whether it be the superblock or an inode.
9605                  * Unfortunately the list may change while we're waiting
9606                  * on the buf lock for either resource so we must loop until
9607                  * we lock the right one.  If both the superblock and an
9608                  * inode point to this inode we must clear the inode first
9609                  * followed by the superblock.
9610                  */
9611                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9612                 pino = 0;
9613                 if (idp && (idp->id_state & UNLINKNEXT))
9614                         pino = idp->id_ino;
9615                 FREE_LOCK(ump);
9616                 if (pino == 0) {
9617                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9618                             (int)fs->fs_sbsize, 0, 0, 0);
9619                 } else {
9620                         error = bread(ump->um_devvp,
9621                             fsbtodb(fs, ino_to_fsba(fs, pino)),
9622                             (int)fs->fs_bsize, NOCRED, &bp);
9623                         if (error)
9624                                 brelse(bp);
9625                 }
9626                 ACQUIRE_LOCK(ump);
9627                 if (error)
9628                         break;
9629                 /* If the list has changed restart the loop. */
9630                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9631                 nino = 0;
9632                 if (idp && (idp->id_state & UNLINKNEXT))
9633                         nino = idp->id_ino;
9634                 if (nino != pino ||
9635                     (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9636                         FREE_LOCK(ump);
9637                         brelse(bp);
9638                         ACQUIRE_LOCK(ump);
9639                         continue;
9640                 }
9641                 nino = 0;
9642                 idn = TAILQ_NEXT(inodedep, id_unlinked);
9643                 if (idn)
9644                         nino = idn->id_ino;
9645                 /*
9646                  * Remove us from the in memory list.  After this we cannot
9647                  * access the inodedep.
9648                  */
9649                 KASSERT((inodedep->id_state & UNLINKED) != 0,
9650                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
9651                     inodedep));
9652                 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9653                 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9654                 FREE_LOCK(ump);
9655                 /*
9656                  * The predecessor's next pointer is manually updated here
9657                  * so that the NEXT flag is never cleared for an element
9658                  * that is in the list.
9659                  */
9660                 if (pino == 0) {
9661                         bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9662                         ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9663                         softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9664                             bp);
9665                 } else if (fs->fs_magic == FS_UFS1_MAGIC)
9666                         ((struct ufs1_dinode *)bp->b_data +
9667                             ino_to_fsbo(fs, pino))->di_freelink = nino;
9668                 else
9669                         ((struct ufs2_dinode *)bp->b_data +
9670                             ino_to_fsbo(fs, pino))->di_freelink = nino;
9671                 /*
9672                  * If the bwrite fails we have no recourse to recover.  The
9673                  * filesystem is corrupted already.
9674                  */
9675                 bwrite(bp);
9676                 ACQUIRE_LOCK(ump);
9677                 /*
9678                  * If the superblock pointer still needs to be cleared force
9679                  * a write here.
9680                  */
9681                 if (fs->fs_sujfree == ino) {
9682                         FREE_LOCK(ump);
9683                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9684                             (int)fs->fs_sbsize, 0, 0, 0);
9685                         bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9686                         ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9687                         softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9688                             bp);
9689                         bwrite(bp);
9690                         ACQUIRE_LOCK(ump);
9691                 }
9692
9693                 if (fs->fs_sujfree != ino)
9694                         return;
9695                 panic("clear_unlinked_inodedep: Failed to clear free head");
9696         }
9697         if (inodedep->id_ino == fs->fs_sujfree)
9698                 panic("clear_unlinked_inodedep: Freeing head of free list");
9699         inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9700         TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9701         return;
9702 }
9703
9704 /*
9705  * This workitem decrements the inode's link count.
9706  * If the link count reaches zero, the file is removed.
9707  */
9708 static int
9709 handle_workitem_remove(dirrem, flags)
9710         struct dirrem *dirrem;
9711         int flags;
9712 {
9713         struct inodedep *inodedep;
9714         struct workhead dotdotwk;
9715         struct worklist *wk;
9716         struct ufsmount *ump;
9717         struct mount *mp;
9718         struct vnode *vp;
9719         struct inode *ip;
9720         ino_t oldinum;
9721
9722         if (dirrem->dm_state & ONWORKLIST)
9723                 panic("handle_workitem_remove: dirrem %p still on worklist",
9724                     dirrem);
9725         oldinum = dirrem->dm_oldinum;
9726         mp = dirrem->dm_list.wk_mp;
9727         ump = VFSTOUFS(mp);
9728         flags |= LK_EXCLUSIVE;
9729         if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9730                 return (EBUSY);
9731         ip = VTOI(vp);
9732         ACQUIRE_LOCK(ump);
9733         if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9734                 panic("handle_workitem_remove: lost inodedep");
9735         if (dirrem->dm_state & ONDEPLIST)
9736                 LIST_REMOVE(dirrem, dm_inonext);
9737         KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9738             ("handle_workitem_remove:  Journal entries not written."));
9739
9740         /*
9741          * Move all dependencies waiting on the remove to complete
9742          * from the dirrem to the inode inowait list to be completed
9743          * after the inode has been updated and written to disk.  Any
9744          * marked MKDIR_PARENT are saved to be completed when the .. ref
9745          * is removed.
9746          */
9747         LIST_INIT(&dotdotwk);
9748         while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9749                 WORKLIST_REMOVE(wk);
9750                 if (wk->wk_state & MKDIR_PARENT) {
9751                         wk->wk_state &= ~MKDIR_PARENT;
9752                         WORKLIST_INSERT(&dotdotwk, wk);
9753                         continue;
9754                 }
9755                 WORKLIST_INSERT(&inodedep->id_inowait, wk);
9756         }
9757         LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9758         /*
9759          * Normal file deletion.
9760          */
9761         if ((dirrem->dm_state & RMDIR) == 0) {
9762                 ip->i_nlink--;
9763                 DIP_SET(ip, i_nlink, ip->i_nlink);
9764                 ip->i_flag |= IN_CHANGE;
9765                 if (ip->i_nlink < ip->i_effnlink)
9766                         panic("handle_workitem_remove: bad file delta");
9767                 if (ip->i_nlink == 0)
9768                         unlinked_inodedep(mp, inodedep);
9769                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9770                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9771                     ("handle_workitem_remove: worklist not empty. %s",
9772                     TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9773                 WORKITEM_FREE(dirrem, D_DIRREM);
9774                 FREE_LOCK(ump);
9775                 goto out;
9776         }
9777         /*
9778          * Directory deletion. Decrement reference count for both the
9779          * just deleted parent directory entry and the reference for ".".
9780          * Arrange to have the reference count on the parent decremented
9781          * to account for the loss of "..".
9782          */
9783         ip->i_nlink -= 2;
9784         DIP_SET(ip, i_nlink, ip->i_nlink);
9785         ip->i_flag |= IN_CHANGE;
9786         if (ip->i_nlink < ip->i_effnlink)
9787                 panic("handle_workitem_remove: bad dir delta");
9788         if (ip->i_nlink == 0)
9789                 unlinked_inodedep(mp, inodedep);
9790         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9791         /*
9792          * Rename a directory to a new parent. Since, we are both deleting
9793          * and creating a new directory entry, the link count on the new
9794          * directory should not change. Thus we skip the followup dirrem.
9795          */
9796         if (dirrem->dm_state & DIRCHG) {
9797                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9798                     ("handle_workitem_remove: DIRCHG and worklist not empty."));
9799                 WORKITEM_FREE(dirrem, D_DIRREM);
9800                 FREE_LOCK(ump);
9801                 goto out;
9802         }
9803         dirrem->dm_state = ONDEPLIST;
9804         dirrem->dm_oldinum = dirrem->dm_dirinum;
9805         /*
9806          * Place the dirrem on the parent's diremhd list.
9807          */
9808         if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9809                 panic("handle_workitem_remove: lost dir inodedep");
9810         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9811         /*
9812          * If the allocated inode has never been written to disk, then
9813          * the on-disk inode is zero'ed and we can remove the file
9814          * immediately.  When journaling if the inode has been marked
9815          * unlinked and not DEPCOMPLETE we know it can never be written.
9816          */
9817         inodedep_lookup(mp, oldinum, 0, &inodedep);
9818         if (inodedep == NULL ||
9819             (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9820             check_inode_unwritten(inodedep)) {
9821                 FREE_LOCK(ump);
9822                 vput(vp);
9823                 return handle_workitem_remove(dirrem, flags);
9824         }
9825         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9826         FREE_LOCK(ump);
9827         ip->i_flag |= IN_CHANGE;
9828 out:
9829         ffs_update(vp, 0);
9830         vput(vp);
9831         return (0);
9832 }
9833
9834 /*
9835  * Inode de-allocation dependencies.
9836  *
9837  * When an inode's link count is reduced to zero, it can be de-allocated. We
9838  * found it convenient to postpone de-allocation until after the inode is
9839  * written to disk with its new link count (zero).  At this point, all of the
9840  * on-disk inode's block pointers are nullified and, with careful dependency
9841  * list ordering, all dependencies related to the inode will be satisfied and
9842  * the corresponding dependency structures de-allocated.  So, if/when the
9843  * inode is reused, there will be no mixing of old dependencies with new
9844  * ones.  This artificial dependency is set up by the block de-allocation
9845  * procedure above (softdep_setup_freeblocks) and completed by the
9846  * following procedure.
9847  */
9848 static void
9849 handle_workitem_freefile(freefile)
9850         struct freefile *freefile;
9851 {
9852         struct workhead wkhd;
9853         struct fs *fs;
9854         struct inodedep *idp;
9855         struct ufsmount *ump;
9856         int error;
9857
9858         ump = VFSTOUFS(freefile->fx_list.wk_mp);
9859         fs = ump->um_fs;
9860 #ifdef DEBUG
9861         ACQUIRE_LOCK(ump);
9862         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9863         FREE_LOCK(ump);
9864         if (error)
9865                 panic("handle_workitem_freefile: inodedep %p survived", idp);
9866 #endif
9867         UFS_LOCK(ump);
9868         fs->fs_pendinginodes -= 1;
9869         UFS_UNLOCK(ump);
9870         LIST_INIT(&wkhd);
9871         LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9872         if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9873             freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9874                 softdep_error("handle_workitem_freefile", error);
9875         ACQUIRE_LOCK(ump);
9876         WORKITEM_FREE(freefile, D_FREEFILE);
9877         FREE_LOCK(ump);
9878 }
9879
9880
9881 /*
9882  * Helper function which unlinks marker element from work list and returns
9883  * the next element on the list.
9884  */
9885 static __inline struct worklist *
9886 markernext(struct worklist *marker)
9887 {
9888         struct worklist *next;
9889
9890         next = LIST_NEXT(marker, wk_list);
9891         LIST_REMOVE(marker, wk_list);
9892         return next;
9893 }
9894
9895 /*
9896  * Disk writes.
9897  *
9898  * The dependency structures constructed above are most actively used when file
9899  * system blocks are written to disk.  No constraints are placed on when a
9900  * block can be written, but unsatisfied update dependencies are made safe by
9901  * modifying (or replacing) the source memory for the duration of the disk
9902  * write.  When the disk write completes, the memory block is again brought
9903  * up-to-date.
9904  *
9905  * In-core inode structure reclamation.
9906  *
9907  * Because there are a finite number of "in-core" inode structures, they are
9908  * reused regularly.  By transferring all inode-related dependencies to the
9909  * in-memory inode block and indexing them separately (via "inodedep"s), we
9910  * can allow "in-core" inode structures to be reused at any time and avoid
9911  * any increase in contention.
9912  *
9913  * Called just before entering the device driver to initiate a new disk I/O.
9914  * The buffer must be locked, thus, no I/O completion operations can occur
9915  * while we are manipulating its associated dependencies.
9916  */
9917 static void
9918 softdep_disk_io_initiation(bp)
9919         struct buf *bp;         /* structure describing disk write to occur */
9920 {
9921         struct worklist *wk;            /* dependency being processed */
9922         struct worklist marker;         /* on-stack placeholder linked into bp->b_dep */
9923         struct inodedep *inodedep;
9924         struct freeblks *freeblks;
9925         struct jblkdep *jblkdep;
9926         struct newblk *newblk;
9927         struct ufsmount *ump;
9928
             /*
              * Run the pre-write step of every dependency attached to
              * bp->b_dep.  Panics if bp is not a write or has a background
              * write in progress, and returns at once when bp carries no
              * dependencies.  The per-filesystem lock is taken here and
              * held across the loop; the jwait() calls and the
              * initiate_write_*() helpers may drop and retake it.
              */

9929         /*
9930          * We only care about write operations. There should never
9931          * be dependencies for reads.
9932          */
9933         if (bp->b_iocmd != BIO_WRITE)
9934                 panic("softdep_disk_io_initiation: not write");
9935
9936         if (bp->b_vflags & BV_BKGRDINPROG)
9937                 panic("softdep_disk_io_initiation: Writing buffer with "
9938                     "background write in progress: %p", bp);
9939
9940         if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
9941                 return;
             /* The mount is taken from the first dependency on the buffer. */
9942         ump = VFSTOUFS(wk->wk_mp);
9943
9944         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
9945         PHOLD(curproc);                 /* Don't swap out kernel stack */
9946         ACQUIRE_LOCK(ump);
9947         /*
9948          * Do any necessary pre-I/O processing.
              *
              * The marker is linked in right after the current item and
              * the loop advances with markernext(&marker) (defined
              * elsewhere; presumably it returns the item following the
              * marker and unlinks the marker).  Cases that call jwait()
              * first move the marker in front of the item so that the
              * same item is looked at again afterwards.
9949          */
9950         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9951              wk = markernext(&marker)) {
9952                 LIST_INSERT_AFTER(wk, &marker, wk_list);
9953                 switch (wk->wk_type) {
9954
9955                 case D_PAGEDEP:
9956                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
9957                         continue;
9958
9959                 case D_INODEDEP:
                             /* Pick the rollback routine for the on-disk inode format. */
9960                         inodedep = WK_INODEDEP(wk);
9961                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9962                                 initiate_write_inodeblock_ufs1(inodedep, bp);
9963                         else
9964                                 initiate_write_inodeblock_ufs2(inodedep, bp);
9965                         continue;
9966
9967                 case D_INDIRDEP:
9968                         initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9969                         continue;
9970
9971                 case D_BMSAFEMAP:
9972                         initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9973                         continue;
9974
9975                 case D_JSEG:
9976                         WK_JSEG(wk)->js_buf = NULL;
9977                         continue;
9978
9979                 case D_FREEBLKS:
9980                         freeblks = WK_FREEBLKS(wk);
9981                         jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9982                         /*
9983                          * We have to wait for the freeblks to be journaled
9984                          * before we can write an inodeblock with updated
9985                          * pointers.  Be careful to arrange the marker so
9986                          * we revisit the freeblks if it's not removed by
9987                          * the first jwait().
9988                          */
9989                         if (jblkdep != NULL) {
9990                                 LIST_REMOVE(&marker, wk_list);
9991                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
9992                                 jwait(&jblkdep->jb_list, MNT_WAIT);
9993                         }
9994                         continue;
9995                 case D_ALLOCDIRECT:
9996                 case D_ALLOCINDIR:
9997                         /*
9998                          * We have to wait for the jnewblk to be journaled
9999                          * before we can write to a block if the contents
10000                          * may be confused with an earlier file's indirect
10001                          * at recovery time.  Handle the marker as described
10002                          * above.
10003                          */
10004                         newblk = WK_NEWBLK(wk);
10005                         if (newblk->nb_jnewblk != NULL &&
10006                             indirblk_lookup(newblk->nb_list.wk_mp,
10007                             newblk->nb_newblkno)) {
10008                                 LIST_REMOVE(&marker, wk_list);
10009                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
10010                                 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10011                         }
10012                         continue;
10013
10014                 case D_SBDEP:
10015                         initiate_write_sbdep(WK_SBDEP(wk));
10016                         continue;
10017
                      /* These types need no work before the write starts. */
10018                 case D_MKDIR:
10019                 case D_FREEWORK:
10020                 case D_FREEDEP:
10021                 case D_JSEGDEP:
10022                         continue;
10023
10024                 default:
                              /* Name this function, matching the panics above. */
10025                         panic("softdep_disk_io_initiation: Unexpected type %s",
10026                             TYPENAME(wk->wk_type));
10027                         /* NOTREACHED */
10028                 }
10029         }
10030         FREE_LOCK(ump);
10031         PRELE(curproc);                 /* Allow swapout of kernel stack */
10032 }
10033
10034 /*
10035  * Called from within the procedure above to deal with unsatisfied
10036  * allocation dependencies in a directory. The buffer must be locked,
10037  * thus, no I/O completion operations can occur while we are
10038  * manipulating its associated dependencies.
       *
       * Marks the pagedep IOSTARTED, waits for its journaled removes and
       * moves to be written, then rolls back in bp every directory entry
       * that still has a pending diradd and flags that diradd UNDONE.
       * If the pagedep is already IOSTARTED a message is printed and
       * nothing else is done.
10039  */
10040 static void
10041 initiate_write_filepage(pagedep, bp)
10042         struct pagedep *pagedep;        /* dependencies of the directory block */
10043         struct buf *bp;                 /* buffer holding the directory block */
10044 {
10045         struct jremref *jremref;
10046         struct jmvref *jmvref;
10047         struct dirrem *dirrem;
10048         struct diradd *dap;
10049         struct direct *ep;              /* directory entry inside bp->b_data */
10050         int i;
10051
10052         if (pagedep->pd_state & IOSTARTED) {
10053                 /*
10054                  * This can only happen if there is a driver that does not
10055                  * understand chaining. Here biodone will reissue the call
10056                  * to strategy for the incomplete buffers.
10057                  */
10058                 printf("initiate_write_filepage: already started\n");
10059                 return;
10060         }
10061         pagedep->pd_state |= IOSTARTED;
10062         /*
10063          * Wait for all journal remove dependencies to hit the disk.
10064          * We can not allow any potentially conflicting directory adds
10065          * to be visible before removes and rollback is too difficult.
10066          * The per-filesystem lock may be dropped and re-acquired, however
10067          * we hold the buf locked so the dependency can not go away.
               * Each list head is re-read after every jwait(), so the loops
               * end only once the corresponding list is empty.
10068          */
10069         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10070                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10071                         jwait(&jremref->jr_list, MNT_WAIT);
10072         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10073                 jwait(&jmvref->jm_list, MNT_WAIT);
              /* Visit the pending directory adds in every hash bucket. */
10074         for (i = 0; i < DAHASHSZ; i++) {
10075                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10076                         ep = (struct direct *)
10077                             ((char *)bp->b_data + dap->da_offset);
                              /* The buffer must currently hold the new inode number. */
10078                         if (ep->d_ino != dap->da_newinum)
10079                                 panic("%s: dir inum %ju != new %ju",
10080                                     "initiate_write_filepage",
10081                                     (uintmax_t)ep->d_ino,
10082                                     (uintmax_t)dap->da_newinum);
                              /*
                               * Undo the add for this write: a changed entry gets
                               * back the inode number it replaced, a new entry is
                               * cleared.
                               */
10083                         if (dap->da_state & DIRCHG)
10084                                 ep->d_ino = dap->da_previous->dm_oldinum;
10085                         else
10086                                 ep->d_ino = 0;
10087                         dap->da_state &= ~ATTACHED;
10088                         dap->da_state |= UNDONE;
10089                 }
10090         }
10091 }
10092
10093 /*
10094  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10095  * Note that any bug fixes made to this routine must also be made in
10096  * the version found below.
10097  *
10098  * Called from within the procedure above to deal with unsatisfied
10099  * allocation dependencies in an inodeblock. The buffer must be
10100  * locked, thus, no I/O completion operations can occur while we
10101  * are manipulating its associated dependencies.
       *
       * Marks the inodedep IOSTARTED (panics if it already was) and edits
       * the inode's copy in bp so that it only describes state that is
       * safe to write.  An inode whose DEPCOMPLETE flag is clear is saved
       * in id_savedino1 and zeroed in the buffer apart from di_gen and
       * di_freelink.  Otherwise the current size and link count are
       * recorded in the inodedep and the link count, block pointers and
       * size are rolled back for the pending dependencies.  The caller
       * holds the per-filesystem lock; it is released and re-acquired
       * around malloc().
10102  */
10103 static void
10104 initiate_write_inodeblock_ufs1(inodedep, bp)
10105         struct inodedep *inodedep;
10106         struct buf *bp;                 /* The inode block */
10107 {
10108         struct allocdirect *adp, *lastadp;
10109         struct ufs1_dinode *dp;         /* this inode's slot in bp->b_data */
10110         struct ufs1_dinode *sip;        /* newly allocated save area */
10111         struct inoref *inoref;
10112         struct ufsmount *ump;
10113         struct fs *fs;
10114         ufs_lbn_t i;
10115 #ifdef INVARIANTS
10116         ufs_lbn_t prevlbn = 0;
10117 #endif
10118         int deplist;                    /* bit per block pointer with a dependency */
10119
10120         if (inodedep->id_state & IOSTARTED)
10121                 panic("initiate_write_inodeblock_ufs1: already started");
10122         inodedep->id_state |= IOSTARTED;
10123         fs = inodedep->id_fs;
10124         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10125         LOCK_OWNED(ump);
10126         dp = (struct ufs1_dinode *)bp->b_data +
10127             ino_to_fsbo(fs, inodedep->id_ino);
10128
10129         /*
10130          * If we're on the unlinked list but have not yet written our
10131          * next pointer initialize it here.
10132          */
10133         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10134                 struct inodedep *inon;
10135
10136                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10137                 dp->di_freelink = inon ? inon->id_ino : 0;
10138         }
10139         /*
10140          * If the bitmap is not yet written, then the allocated
10141          * inode cannot be written to disk.
               * Keep a full copy of the inode in id_savedino1 and write
               * zeroes instead, preserving only di_gen and di_freelink.
10142          */
10143         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10144                 if (inodedep->id_savedino1 != NULL)
10145                         panic("initiate_write_inodeblock_ufs1: I/O underway");
                      /* The lock is not held across the allocation. */
10146                 FREE_LOCK(ump);
10147                 sip = malloc(sizeof(struct ufs1_dinode),
10148                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10149                 ACQUIRE_LOCK(ump);
10150                 inodedep->id_savedino1 = sip;
10151                 *inodedep->id_savedino1 = *dp;
10152                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10153                 dp->di_gen = inodedep->id_savedino1->di_gen;
10154                 dp->di_freelink = inodedep->id_savedino1->di_freelink;
10155                 return;
10156         }
10157         /*
10158          * If no dependencies, then there is nothing to roll back.
               * The current size and link count are recorded in the
               * inodedep either way; UFS1 has no ext data, so the saved
               * ext size is zero.
10159          */
10160         inodedep->id_savedsize = dp->di_size;
10161         inodedep->id_savedextsize = 0;
10162         inodedep->id_savednlink = dp->di_nlink;
10163         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10164             TAILQ_EMPTY(&inodedep->id_inoreflst))
10165                 return;
10166         /*
10167          * Revert the link count to that of the first unwritten journal entry.
10168          */
10169         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10170         if (inoref)
10171                 dp->di_nlink = inoref->if_nlink;
10172         /*
10173          * Set the dependencies to busy.
               * Under INVARIANTS also check that the list is in increasing
               * lbn order and that the buffer holds each new block number.
               * NOTE(review): the panic strings below use the name
               * softdep_write_inodeblock rather than this function's name.
10174          */
10175         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10176              adp = TAILQ_NEXT(adp, ad_next)) {
10177 #ifdef INVARIANTS
10178                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10179                         panic("softdep_write_inodeblock: lbn order");
10180                 prevlbn = adp->ad_offset;
10181                 if (adp->ad_offset < NDADDR &&
10182                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10183                         panic("%s: direct pointer #%jd mismatch %d != %jd",
10184                             "softdep_write_inodeblock",
10185                             (intmax_t)adp->ad_offset,
10186                             dp->di_db[adp->ad_offset],
10187                             (intmax_t)adp->ad_newblkno);
10188                 if (adp->ad_offset >= NDADDR &&
10189                     dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10190                         panic("%s: indirect pointer #%jd mismatch %d != %jd",
10191                             "softdep_write_inodeblock",
10192                             (intmax_t)adp->ad_offset - NDADDR,
10193                             dp->di_ib[adp->ad_offset - NDADDR],
10194                             (intmax_t)adp->ad_newblkno);
10195                 deplist |= 1 << adp->ad_offset;
10196                 if ((adp->ad_state & ATTACHED) == 0)
10197                         panic("softdep_write_inodeblock: Unknown state 0x%x",
10198                             adp->ad_state);
10199 #endif /* INVARIANTS */
10200                 adp->ad_state &= ~ATTACHED;
10201                 adp->ad_state |= UNDONE;
10202         }
10203         /*
10204          * The on-disk inode cannot claim to be any larger than the last
10205          * fragment that has been written. Otherwise, the on-disk inode
10206          * might have fragments that were not the last block in the file
10207          * which would corrupt the filesystem.
               * Direct pointers are rolled back to their old block numbers
               * one by one.  At the first rollback to a fragment the size is
               * cut to end at that fragment, every later direct and indirect
               * pointer is zeroed and the routine returns.
10208          */
10209         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10210              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10211                 if (adp->ad_offset >= NDADDR)
10212                         break;
10213                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10214                 /* keep going until hitting a rollback to a frag */
10215                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10216                         continue;
10217                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10218                 for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10219 #ifdef INVARIANTS
10220                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10221                                 panic("softdep_write_inodeblock: lost dep1");
10222 #endif /* INVARIANTS */
10223                         dp->di_db[i] = 0;
10224                 }
10225                 for (i = 0; i < NIADDR; i++) {
10226 #ifdef INVARIANTS
10227                         if (dp->di_ib[i] != 0 &&
10228                             (deplist & ((1 << NDADDR) << i)) == 0)
10229                                 panic("softdep_write_inodeblock: lost dep2");
10230 #endif /* INVARIANTS */
10231                         dp->di_ib[i] = 0;
10232                 }
10233                 return;
10234         }
10235         /*
10236          * If we have zero'ed out the last allocated block of the file,
10237          * roll back the size to the last currently allocated block.
10238          * We know that this last allocated block is full-sized as
10239          * we already checked for fragments in the loop above.
               * lastadp is the last direct-block dependency handled above;
               * if none of the pointers up to it is set, i ends at -1 and
               * the size becomes zero.
10240          */
10241         if (lastadp != NULL &&
10242             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10243                 for (i = lastadp->ad_offset; i >= 0; i--)
10244                         if (dp->di_db[i] != 0)
10245                                 break;
10246                 dp->di_size = (i + 1) * fs->fs_bsize;
10247         }
10248         /*
10249          * The only dependencies are for indirect blocks.
10250          *
10251          * The file size for indirect block additions is not guaranteed.
10252          * Such a guarantee would be non-trivial to achieve. The conventional
10253          * synchronous write implementation also does not make this guarantee.
10254          * Fsck should catch and fix discrepancies. Arguably, the file size
10255          * can be over-estimated without destroying integrity when the file
10256          * moves into the indirect blocks (i.e., is large). If we want to
10257          * postpone fsck, we are stuck with this argument.
               *
               * adp continues from where the loop above stopped, so only
               * entries with ad_offset >= NDADDR are reached here.
10258          */
10259         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10260                 dp->di_ib[adp->ad_offset - NDADDR] = 0;
10261 }
10262
10263 /*
10264  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10265  * Note that any bug fixes made to this routine must also be made in
10266  * the version found above.
10267  *
10268  * Called from within the procedure above to deal with unsatisfied
10269  * allocation dependencies in an inodeblock. The buffer must be
10270  * locked, thus, no I/O completion operations can occur while we
10271  * are manipulating its associated dependencies.
       *
       * Marks the inodedep IOSTARTED (panics if it already was) and edits
       * the inode's copy in bp so that it only describes state that is
       * safe to write.  An inode whose DEPCOMPLETE flag is clear is saved
       * in id_savedino2 and zeroed in the buffer apart from di_gen and
       * di_freelink.  Otherwise the current size, ext size and link count
       * are recorded in the inodedep and the link count, the ext data
       * pointers and size, and then the file block pointers and size are
       * rolled back for the pending dependencies.  The caller holds the
       * per-filesystem lock; it is released and re-acquired around
       * malloc().
10272  */
10273 static void
10274 initiate_write_inodeblock_ufs2(inodedep, bp)
10275         struct inodedep *inodedep;
10276         struct buf *bp;                 /* The inode block */
10277 {
10278         struct allocdirect *adp, *lastadp;
10279         struct ufs2_dinode *dp;         /* this inode's slot in bp->b_data */
10280         struct ufs2_dinode *sip;        /* newly allocated save area */
10281         struct inoref *inoref;
10282         struct ufsmount *ump;
10283         struct fs *fs;
10284         ufs_lbn_t i;
10285 #ifdef INVARIANTS
10286         ufs_lbn_t prevlbn = 0;
10287 #endif
10288         int deplist;                    /* bit per block pointer with a dependency */
10289
10290         if (inodedep->id_state & IOSTARTED)
10291                 panic("initiate_write_inodeblock_ufs2: already started");
10292         inodedep->id_state |= IOSTARTED;
10293         fs = inodedep->id_fs;
10294         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10295         LOCK_OWNED(ump);
10296         dp = (struct ufs2_dinode *)bp->b_data +
10297             ino_to_fsbo(fs, inodedep->id_ino);
10298
10299         /*
10300          * If we're on the unlinked list but have not yet written our
10301          * next pointer initialize it here.
10302          */
10303         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10304                 struct inodedep *inon;
10305
10306                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10307                 dp->di_freelink = inon ? inon->id_ino : 0;
10308         }
10309         /*
10310          * If the bitmap is not yet written, then the allocated
10311          * inode cannot be written to disk.
               * Keep a full copy of the inode in id_savedino2 and write
               * zeroes instead, preserving only di_gen and di_freelink.
10312          */
10313         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10314                 if (inodedep->id_savedino2 != NULL)
10315                         panic("initiate_write_inodeblock_ufs2: I/O underway");
                      /* The lock is not held across the allocation. */
10316                 FREE_LOCK(ump);
10317                 sip = malloc(sizeof(struct ufs2_dinode),
10318                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10319                 ACQUIRE_LOCK(ump);
10320                 inodedep->id_savedino2 = sip;
10321                 *inodedep->id_savedino2 = *dp;
10322                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10323                 dp->di_gen = inodedep->id_savedino2->di_gen;
10324                 dp->di_freelink = inodedep->id_savedino2->di_freelink;
10325                 return;
10326         }
10327         /*
10328          * If no dependencies, then there is nothing to roll back.
               * The current size, ext size and link count are recorded in
               * the inodedep either way.
10329          */
10330         inodedep->id_savedsize = dp->di_size;
10331         inodedep->id_savedextsize = dp->di_extsize;
10332         inodedep->id_savednlink = dp->di_nlink;
10333         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10334             TAILQ_EMPTY(&inodedep->id_extupdt) &&
10335             TAILQ_EMPTY(&inodedep->id_inoreflst))
10336                 return;
10337         /*
10338          * Revert the link count to that of the first unwritten journal entry.
10339          */
10340         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10341         if (inoref)
10342                 dp->di_nlink = inoref->if_nlink;
10343
10344         /*
10345          * Set the ext data dependencies to busy.
               * Under INVARIANTS also check that the list is in increasing
               * lbn order and that the buffer holds each new block number.
               * NOTE(review): the panic strings in this function use the
               * name softdep_write_inodeblock rather than its own name.
10346          */
10347         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10348              adp = TAILQ_NEXT(adp, ad_next)) {
10349 #ifdef INVARIANTS
10350                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10351                         panic("softdep_write_inodeblock: lbn order");
10352                 prevlbn = adp->ad_offset;
10353                 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10354                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
10355                             "softdep_write_inodeblock",
10356                             (intmax_t)adp->ad_offset,
10357                             (intmax_t)dp->di_extb[adp->ad_offset],
10358                             (intmax_t)adp->ad_newblkno);
10359                 deplist |= 1 << adp->ad_offset;
10360                 if ((adp->ad_state & ATTACHED) == 0)
10361                         panic("softdep_write_inodeblock: Unknown state 0x%x",
10362                             adp->ad_state);
10363 #endif /* INVARIANTS */
10364                 adp->ad_state &= ~ATTACHED;
10365                 adp->ad_state |= UNDONE;
10366         }
10367         /*
10368          * The on-disk inode cannot claim to be any larger than the last
10369          * fragment that has been written. Otherwise, the on-disk inode
10370          * might have fragments that were not the last block in the ext
10371          * data which would corrupt the filesystem.
               * Unlike the file data loop further down, hitting a fragment
               * here does not return: lastadp is cleared so the size fixup
               * below is skipped, and processing continues with the file
               * data dependencies.
10372          */
10373         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10374              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10375                 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10376                 /* keep going until hitting a rollback to a frag */
10377                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10378                         continue;
10379                 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10380                 for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10381 #ifdef INVARIANTS
10382                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10383                                 panic("softdep_write_inodeblock: lost dep1");
10384 #endif /* INVARIANTS */
10385                         dp->di_extb[i] = 0;
10386                 }
10387                 lastadp = NULL;
10388                 break;
10389         }
10390         /*
10391          * If we have zero'ed out the last allocated block of the ext
10392          * data, roll back the size to the last currently allocated block.
10393          * We know that this last allocated block is full-sized as
10394          * we already checked for fragments in the loop above.
10395          */
10396         if (lastadp != NULL &&
10397             dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10398                 for (i = lastadp->ad_offset; i >= 0; i--)
10399                         if (dp->di_extb[i] != 0)
10400                                 break;
10401                 dp->di_extsize = (i + 1) * fs->fs_bsize;
10402         }
10403         /*
10404          * Set the file data dependencies to busy.
               * deplist starts again from zero for the file data list.
10405          */
10406         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10407              adp = TAILQ_NEXT(adp, ad_next)) {
10408 #ifdef INVARIANTS
10409                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10410                         panic("softdep_write_inodeblock: lbn order");
10411                 if ((adp->ad_state & ATTACHED) == 0)
10412                         panic("inodedep %p and adp %p not attached", inodedep, adp);
10413                 prevlbn = adp->ad_offset;
10414                 if (adp->ad_offset < NDADDR &&
10415                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10416                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
10417                             "softdep_write_inodeblock",
10418                             (intmax_t)adp->ad_offset,
10419                             (intmax_t)dp->di_db[adp->ad_offset],
10420                             (intmax_t)adp->ad_newblkno);
10421                 if (adp->ad_offset >= NDADDR &&
10422                     dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10423                         panic("%s indirect pointer #%jd mismatch %jd != %jd",
10424                             "softdep_write_inodeblock:",
10425                             (intmax_t)adp->ad_offset - NDADDR,
10426                             (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10427                             (intmax_t)adp->ad_newblkno);
10428                 deplist |= 1 << adp->ad_offset;
                      /*
                       * NOTE(review): ATTACHED was already tested a few lines
                       * up in this iteration, so this second test cannot fire.
                       */
10429                 if ((adp->ad_state & ATTACHED) == 0)
10430                         panic("softdep_write_inodeblock: Unknown state 0x%x",
10431                             adp->ad_state);
10432 #endif /* INVARIANTS */
10433                 adp->ad_state &= ~ATTACHED;
10434                 adp->ad_state |= UNDONE;
10435         }
10436         /*
10437          * The on-disk inode cannot claim to be any larger than the last
10438          * fragment that has been written. Otherwise, the on-disk inode
10439          * might have fragments that were not the last block in the file
10440          * which would corrupt the filesystem.
               * Direct pointers are rolled back to their old block numbers
               * one by one.  At the first rollback to a fragment the size is
               * cut to end at that fragment, every later direct and indirect
               * pointer is zeroed and the routine returns.
10441          */
10442         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10443              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10444                 if (adp->ad_offset >= NDADDR)
10445                         break;
10446                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10447                 /* keep going until hitting a rollback to a frag */
10448                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10449                         continue;
10450                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10451                 for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10452 #ifdef INVARIANTS
10453                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10454                                 panic("softdep_write_inodeblock: lost dep2");
10455 #endif /* INVARIANTS */
10456                         dp->di_db[i] = 0;
10457                 }
10458                 for (i = 0; i < NIADDR; i++) {
10459 #ifdef INVARIANTS
10460                         if (dp->di_ib[i] != 0 &&
10461                             (deplist & ((1 << NDADDR) << i)) == 0)
10462                                 panic("softdep_write_inodeblock: lost dep3");
10463 #endif /* INVARIANTS */
10464                         dp->di_ib[i] = 0;
10465                 }
10466                 return;
10467         }
10468         /*
10469          * If we have zero'ed out the last allocated block of the file,
10470          * roll back the size to the last currently allocated block.
10471          * We know that this last allocated block is full-sized as
10472          * we already checked for fragments in the loop above.
               * lastadp is the last direct-block dependency handled above;
               * if none of the pointers up to it is set, i ends at -1 and
               * the size becomes zero.
10473          */
10474         if (lastadp != NULL &&
10475             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10476                 for (i = lastadp->ad_offset; i >= 0; i--)
10477                         if (dp->di_db[i] != 0)
10478                                 break;
10479                 dp->di_size = (i + 1) * fs->fs_bsize;
10480         }
10481         /*
10482          * The only dependencies are for indirect blocks.
10483          *
10484          * The file size for indirect block additions is not guaranteed.
10485          * Such a guarantee would be non-trivial to achieve. The conventional
10486          * synchronous write implementation also does not make this guarantee.
10487          * Fsck should catch and fix discrepancies. Arguably, the file size
10488          * can be over-estimated without destroying integrity when the file
10489          * moves into the indirect blocks (i.e., is large). If we want to
10490          * postpone fsck, we are stuck with this argument.
               *
               * adp continues from where the loop above stopped, so only
               * entries with ad_offset >= NDADDR are reached here.
10491          */
10492         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10493                 dp->di_ib[adp->ad_offset - NDADDR] = 0;
10494 }
10495
10496 /*
10497  * Cancel an indirdep as a result of truncation.  Release all of the
10498  * children allocindirs and place their journal work on the appropriate
10499  * list.
       *
       * Panics if the indirdep is already GOINGAWAY.  On return the
       * indirdep is marked GOINGAWAY, all four of its allocindir lists
       * have been drained through cancel_allocindir(), the contents of bp
       * have been copied aside, the indirdep has been moved onto the
       * worklist of its save buffer, its ir_bp is cleared and freeblks is
       * recorded in ir_freeblks.
10500  */
10501 static void
10502 cancel_indirdep(indirdep, bp, freeblks)
10503         struct indirdep *indirdep;
10504         struct buf *bp;                 /* buffer holding the indirect block */
10505         struct freeblks *freeblks;      /* handed to each cancel_allocindir() */
10506 {
10507         struct allocindir *aip;
10508
10509         /*
10510          * None of the indirect pointers will ever be visible,
10511          * so they can simply be tossed. GOINGAWAY ensures
10512          * that allocated pointers will be saved in the buffer
10513          * cache until they are freed. Note that they will
10514          * only be able to be found by their physical address
10515          * since the inode mapping the logical address will
10516          * be gone. The save buffer used for the safe copy
10517          * was allocated in setup_allocindir_phase2 using
10518          * the physical address so it could be used for this
10519          * purpose. Hence we swap the safe copy with the real
10520          * copy, allowing the safe copy to be freed and holding
10521          * on to the real copy for later use in indir_trunc.
10522          */
10523         if (indirdep->ir_state & GOINGAWAY)
10524                 panic("cancel_indirdep: already gone");
              /* Take an incomplete indirdep off the ir_next list it is linked on. */
10525         if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10526                 indirdep->ir_state |= DEPCOMPLETE;
10527                 LIST_REMOVE(indirdep, ir_next);
10528         }
10529         indirdep->ir_state |= GOINGAWAY;
10530         /*
10531          * Pass in bp for blocks that still have journal writes
10532          * pending so we can cancel them on their own.
               * Only the ir_deplisthd entries get bp; the other three lists
               * are cancelled with a NULL buffer.  Each loop re-reads the
               * list head, so it runs until that list is empty.
10533          */
10534         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
10535                 cancel_allocindir(aip, bp, freeblks, 0);
10536         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
10537                 cancel_allocindir(aip, NULL, freeblks, 0);
10538         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
10539                 cancel_allocindir(aip, NULL, freeblks, 0);
10540         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
10541                 cancel_allocindir(aip, NULL, freeblks, 0);
10542         /*
10543          * If there are pending partial truncations we need to keep the
10544          * old block copy around until they complete.  This is because
10545          * the current b_data is not a perfect superset of the available
10546          * blocks.
               * With no truncation pending the current contents overwrite
               * the save buffer; otherwise they go to ir_saveddata and the
               * save buffer is left as it is.
10547          */
10548         if (TAILQ_EMPTY(&indirdep->ir_trunc))
10549                 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10550         else
10551                 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
              /* Move the indirdep from its current worklist to the save buffer's. */
10552         WORKLIST_REMOVE(&indirdep->ir_list);
10553         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10554         indirdep->ir_bp = NULL;
10555         indirdep->ir_freeblks = freeblks;
10556 }
10557
10558 /*
10559  * Free an indirdep once it no longer has new pointers to track.
       *
       * The caller must already have emptied the truncation list and the
       * four allocindir lists, set DEPCOMPLETE and released ir_saveddata;
       * each of these is checked with KASSERT.  The indirdep is taken off
       * its worklist if it is still on one and then freed.
10560  */
10561 static void
10562 free_indirdep(indirdep)
10563         struct indirdep *indirdep;      /* dependency to release */
10564 {
10565
10566         KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10567             ("free_indirdep: Indir trunc list not empty."));
10568         KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10569             ("free_indirdep: Complete head not empty."));
10570         KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10571             ("free_indirdep: write head not empty."));
10572         KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10573             ("free_indirdep: done head not empty."));
10574         KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10575             ("free_indirdep: deplist head not empty."));
10576         KASSERT((indirdep->ir_state & DEPCOMPLETE),
10577             ("free_indirdep: %p still on newblk list.", indirdep));
10578         KASSERT(indirdep->ir_saveddata == NULL,
10579             ("free_indirdep: %p still has saved data.", indirdep));
              /* Unlink from whatever worklist still holds it before freeing. */
10580         if (indirdep->ir_state & ONWORKLIST)
10581                 WORKLIST_REMOVE(&indirdep->ir_list);
10582         WORKITEM_FREE(indirdep, D_INDIRDEP);
10583 }
10584
10585 /*
10586  * Called before a write to an indirdep.  This routine is responsible for
10587  * rolling back pointers to a safe state which includes only those
10588  * allocindirs which have been completed.
       *
       * Marks the indirdep IOSTARTED and panics if it is GOINGAWAY.  When
       * allocindirs or truncations are still pending, the current contents
       * of bp are stashed in ir_saveddata (allocated here on first use),
       * bp is overwritten with the contents of the save buffer, and the
       * indirdep is flagged UNDONE.  The per-filesystem lock is released
       * and re-acquired around malloc().
10589  */
10590 static void
10591 initiate_write_indirdep(indirdep, bp)
10592         struct indirdep *indirdep;
10593         struct buf *bp;                 /* the indirect block being written */
10594 {
10595         struct ufsmount *ump;
10596
10597         indirdep->ir_state |= IOSTARTED;
10598         if (indirdep->ir_state & GOINGAWAY)
10599                 panic("disk_io_initiation: indirdep gone");
10600         /*
10601          * If there are no remaining dependencies, this will be writing
10602          * the real pointers.
10603          */
10604         if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10605             TAILQ_EMPTY(&indirdep->ir_trunc))
10606                 return;
10607         /*
10608          * Replace up-to-date version with safe version.
               * A buffer left over in ir_saveddata is reused as is.
10609          */
10610         if (indirdep->ir_saveddata == NULL) {
10611                 ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10612                 LOCK_OWNED(ump);
                      /* The lock is not held across the allocation. */
10613                 FREE_LOCK(ump);
10614                 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10615                     M_SOFTDEP_FLAGS);
10616                 ACQUIRE_LOCK(ump);
10617         }
10618         indirdep->ir_state &= ~ATTACHED;
10619         indirdep->ir_state |= UNDONE;
              /* Save the live block first, then load the safe copy into bp. */
10620         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10621         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10622             bp->b_bcount);
10623 }
10624
10625 /*
10626  * Called when an inode has been cleared in a cg bitmap.  This finally
10627  * eliminates any canceled jaddrefs.
       *
       *   mp   - mount point; asserted to be a soft updates mount
       *   bp   - buffer holding the cylinder group whose inode map changed
       *   ino  - number of the inode that was freed
       *   wkhd - optional (may be NULL) list of work items; D_JADDREF
       *          entries are freed here and the list is then passed to
       *          jwork_move() together with bp's dependency list
       *
       * Panics if the inode is still marked in the cg inode map or if
       * inodedep_lookup() reports an existing inodedep for it.  The whole
       * body runs between ACQUIRE_LOCK() and FREE_LOCK() on the mount.
10628  */
10629 void
10630 softdep_setup_inofree(mp, bp, ino, wkhd)
10631         struct mount *mp;
10632         struct buf *bp;
10633         ino_t ino;
10634         struct workhead *wkhd;
10635 {
10636         struct worklist *wk, *wkn;
10637         struct inodedep *inodedep;
10638         struct ufsmount *ump;
10639         uint8_t *inosused;
10640         struct cg *cgp;
10641         struct fs *fs;
10642
10643         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10644             ("softdep_setup_inofree called on non-softdep filesystem"));
10645         ump = VFSTOUFS(mp);
10646         ACQUIRE_LOCK(ump);
10647         fs = ump->um_fs;
10648         cgp = (struct cg *)bp->b_data;
10649         inosused = cg_inosused(cgp);
              /* The map is indexed by the inode's offset within its group. */
10650         if (isset(inosused, ino % fs->fs_ipg))
10651                 panic("softdep_setup_inofree: inode %ju not freed.",
10652                     (uintmax_t)ino);
10653         if (inodedep_lookup(mp, ino, 0, &inodedep))
10654                 panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10655                     (uintmax_t)ino, inodedep);
10656         if (wkhd) {
                      /* The _SAFE form is needed: entries are freed mid-walk. */
10657                 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10658                         if (wk->wk_type != D_JADDREF)
10659                                 continue;
10660                         WORKLIST_REMOVE(wk);
10661                         /*
10662                          * We can free immediately even if the jaddref
10663                          * isn't attached in a background write as now
10664                          * the bitmaps are reconciled.
10665                          */
10666                         wk->wk_state |= COMPLETE | ATTACHED;
10667                         free_jaddref(WK_JADDREF(wk));
10668                 }
                      /*
                       * NOTE(review): presumably this moves whatever is left
                       * on wkhd onto the buffer's dependency list -- confirm
                       * against jwork_move().
                       */
10669                 jwork_move(&bp->b_dep, wkhd);
10670         }
10671         FREE_LOCK(ump);
10672 }
10673
10674
10675 /*
10676  * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10677  * map.  Any dependencies waiting for the write to clear are added to the
10678  * buf's list and any jnewblks that are being canceled are discarded
10679  * immediately.
       *
       *   mp    - mount point; asserted to be a soft updates mount
       *   bp    - buffer holding the cylinder group that was updated
       *   blkno - first fragment of the freed range
       *   frags - number of fragments freed
       *   wkhd  - optional (may be NULL) list of work items; it is drained
       *           completely: D_JNEWBLK entries are freed, everything else
       *           is inserted on the bmsafemap's sm_freehd list
       *
       * The SUJ_DEBUG sections only add consistency checks that panic.
10680  */
10681 void
10682 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10683         struct mount *mp;
10684         struct buf *bp;
10685         ufs2_daddr_t blkno;
10686         int frags;
10687         struct workhead *wkhd;
10688 {
10689         struct bmsafemap *bmsafemap;
10690         struct jnewblk *jnewblk;
10691         struct ufsmount *ump;
10692         struct worklist *wk;
10693         struct fs *fs;
10694 #ifdef SUJ_DEBUG
10695         uint8_t *blksfree;
10696         struct cg *cgp;
10697         ufs2_daddr_t jstart;
10698         ufs2_daddr_t jend;
10699         ufs2_daddr_t end;
10700         long bno;
10701         int i;
10702 #endif
10703
10704         CTR3(KTR_SUJ,
10705             "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10706             blkno, frags, wkhd);
10707
10708         ump = VFSTOUFS(mp);
10709         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10710             ("softdep_setup_blkfree called on non-softdep filesystem"));
10711         ACQUIRE_LOCK(ump);
10712         /* Lookup the bmsafemap so we track when it is dirty. */
10713         fs = ump->um_fs;
10714         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10715         /*
10716          * Detach any jnewblks which have been canceled.  They must linger
10717          * until the bitmap is cleared again by ffs_blkfree() to prevent
10718          * an unjournaled allocation from hitting the disk.
10719          */
10720         if (wkhd) {
                      /* Pop from the head until the caller's list is empty. */
10721                 while ((wk = LIST_FIRST(wkhd)) != NULL) {
10722                         CTR2(KTR_SUJ,
10723                             "softdep_setup_blkfree: blkno %jd wk type %d",
10724                             blkno, wk->wk_type);
10725                         WORKLIST_REMOVE(wk);
                              /* Anything but a jnewblk waits on sm_freehd. */
10726                         if (wk->wk_type != D_JNEWBLK) {
10727                                 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10728                                 continue;
10729                         }
10730                         jnewblk = WK_JNEWBLK(wk);
10731                         KASSERT(jnewblk->jn_state & GOINGAWAY,
10732                             ("softdep_setup_blkfree: jnewblk not canceled."));
10733 #ifdef SUJ_DEBUG
10734                         /*
10735                          * Assert that this block is free in the bitmap
10736                          * before we discard the jnewblk.
10737                          */
10738                         cgp = (struct cg *)bp->b_data;
10739                         blksfree = cg_blksfree(cgp);
10740                         bno = dtogd(fs, jnewblk->jn_blkno);
10741                         for (i = jnewblk->jn_oldfrags;
10742                             i < jnewblk->jn_frags; i++) {
10743                                 if (isset(blksfree, bno + i))
10744                                         continue;
10745                                 panic("softdep_setup_blkfree: not free");
10746                         }
10747 #endif
10748                         /*
10749                          * Even if it's not attached we can free immediately
10750                          * as the new bitmap is correct.
10751                          */
10752                         wk->wk_state |= COMPLETE | ATTACHED;
10753                         free_jnewblk(jnewblk);
10754                 }
10755         }
10756
10757 #ifdef SUJ_DEBUG
10758         /*
10759          * Assert that we are not freeing a block which has an outstanding
10760          * allocation dependency.
               *
               * fs and bmsafemap are looked up a second time here with the
               * same arguments as above.
10761          */
10762         fs = VFSTOUFS(mp)->um_fs;
10763         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10764         end = blkno + frags;
10765         LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10766                 /*
10767                  * Don't match against blocks that will be freed when the
10768                  * background write is done.
10769                  */
10770                 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10771                     (COMPLETE | DEPCOMPLETE))
10772                         continue;
10773                 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10774                 jend = jnewblk->jn_blkno + jnewblk->jn_frags;
                      /*
                       * NOTE(review): this fires only when one end of the freed
                       * range [blkno, end) lies inside [jstart, jend).  A freed
                       * range that strictly contains the jnewblk range
                       * (blkno < jstart and end > jend) is not reported --
                       * confirm whether that is intended.
                       */
10775                 if ((blkno >= jstart && blkno < jend) ||
10776                     (end > jstart && end <= jend)) {
10777                         printf("state 0x%X %jd - %d %d dep %p\n",
10778                             jnewblk->jn_state, jnewblk->jn_blkno,
10779                             jnewblk->jn_oldfrags, jnewblk->jn_frags,
10780                             jnewblk->jn_dep);
10781                         panic("softdep_setup_blkfree: "
10782                             "%jd-%jd(%d) overlaps with %jd-%jd",
10783                             blkno, end, frags, jstart, jend);
10784                 }
10785         }
10786 #endif
10787         FREE_LOCK(ump);
10788 }
10789
10790 /*
10791  * Revert a block allocation when the journal record that describes it
10792  * is not yet written.
       *
       *   jnewblk  - journal record for the allocation being reverted
       *   fs       - superblock, used for the fragment/block geometry
       *   cgp      - cylinder group whose summary counts are adjusted
       *   blksfree - free-fragment bitmap of that cylinder group
       *
       * Returns the number of fragments that were marked free again, or 0
       * if none of the fragments in [jn_oldfrags, jn_frags) were found
       * allocated in this copy of the map (in which case nothing is
       * changed, including the jnewblk state).  On a rollback the jnewblk
       * has ATTACHED cleared and UNDONE set.
10793  */
10794 static int
10795 jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10796         struct jnewblk *jnewblk;
10797         struct fs *fs;
10798         struct cg *cgp;
10799         uint8_t *blksfree;
10800 {
10801         ufs1_daddr_t fragno;
10802         long cgbno, bbase;
10803         int frags, blk;
10804         int i;
10805
10806         frags = 0;
10807         cgbno = dtogd(fs, jnewblk->jn_blkno);
10808         /*
10809          * We have to test which frags need to be rolled back.  We may
10810          * be operating on a stale copy when doing background writes.
               * A clear bit in blksfree is counted as an allocated fragment.
10811          */
10812         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10813                 if (isclr(blksfree, cgbno + i))
10814                         frags++;
10815         if (frags == 0)
10816                 return (0);
10817         /*
10818          * This is mostly ffs_blkfree() sans some validation and
10819          * superblock updates.
10820          */
10821         if (frags == fs->fs_frag) {
                      /* A whole block: free it as a block. */
10822                 fragno = fragstoblks(fs, cgbno);
10823                 ffs_setblock(fs, blksfree, fragno);
10824                 ffs_clusteracct(fs, cgp, fragno, 1);
10825                 cgp->cg_cs.cs_nbfree++;
10826         } else {
                      /*
                       * NOTE(review): the loop below sets `frags' consecutive
                       * bits starting at jn_oldfrags, which matches the count
                       * above only if the allocated fragments are contiguous
                       * from that offset -- confirm.
                       */
10827                 cgbno += jnewblk->jn_oldfrags;
10828                 bbase = cgbno - fragnum(fs, cgbno);
10829                 /* Decrement the old frags.  */
10830                 blk = blkmap(fs, blksfree, bbase);
10831                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10832                 /* Deallocate the fragment */
10833                 for (i = 0; i < frags; i++)
10834                         setbit(blksfree, cgbno + i);
10835                 cgp->cg_cs.cs_nffree += frags;
10836                 /* Add back in counts associated with the new frags */
10837                 blk = blkmap(fs, blksfree, bbase);
10838                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10839                 /* If a complete block has been reassembled, account for it. */
10840                 fragno = fragstoblks(fs, bbase);
10841                 if (ffs_isblock(fs, blksfree, fragno)) {
10842                         cgp->cg_cs.cs_nffree -= fs->fs_frag;
10843                         ffs_clusteracct(fs, cgp, fragno, 1);
10844                         cgp->cg_cs.cs_nbfree++;
10845                 }
10846         }
10847         stat_jnewblk++;
10848         jnewblk->jn_state &= ~ATTACHED;
10849         jnewblk->jn_state |= UNDONE;
10850
10851         return (frags);
10852 }
10853
      /*
       * Prepare a cylinder group buffer for writing by undoing allocations
       * whose journal records are still pending.
       *
       * Does nothing if IOSTARTED is already set on the bmsafemap.
       * Otherwise, for every jaddref on sm_jaddrefhd the inode's bit is
       * cleared in the cg inode map and the cg summary counts are adjusted;
       * for every jnewblk on sm_jnewblkhd jnewblk_rollback() is applied.
       * A jaddref whose inode bit is already clear, or a jnewblk for which
       * nothing could be rolled back, causes a panic.  Finally the
       * inodedep, newblk and free lists are swapped with their "written"
       * counterparts.
       */
10854 static void
10855 initiate_write_bmsafemap(bmsafemap, bp)
10856         struct bmsafemap *bmsafemap;
10857         struct buf *bp;                 /* The cg block. */
10858 {
10859         struct jaddref *jaddref;
10860         struct jnewblk *jnewblk;
10861         uint8_t *inosused;
10862         uint8_t *blksfree;
10863         struct cg *cgp;
10864         struct fs *fs;
10865         ino_t ino;
10866
10867         /*
10868          * If this is a background write, we did this at the time that
10869          * the copy was made, so do not need to do it again.
10870          */
10871         if (bmsafemap->sm_state & IOSTARTED)
10872                 return;
10873         bmsafemap->sm_state |= IOSTARTED;
10874         /*
10875          * Clear any inode allocations which are pending journal writes.
10876          */
10877         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10878                 cgp = (struct cg *)bp->b_data;
10879                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10880                 inosused = cg_inosused(cgp);
10881                 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
                              /* Index of the inode within this cylinder group. */
10882                         ino = jaddref->ja_ino % fs->fs_ipg;
10883                         if (isset(inosused, ino)) {
                                      /* Directories are also counted in cs_ndir. */
10884                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
10885                                         cgp->cg_cs.cs_ndir--;
10886                                 cgp->cg_cs.cs_nifree++;
10887                                 clrbit(inosused, ino);
10888                                 jaddref->ja_state &= ~ATTACHED;
10889                                 jaddref->ja_state |= UNDONE;
10890                                 stat_jaddref++;
10891                         } else
10892                                 panic("initiate_write_bmsafemap: inode %ju "
10893                                     "marked free", (uintmax_t)jaddref->ja_ino);
10894                 }
10895         }
10896         /*
10897          * Clear any block allocations which are pending journal writes.
10898          */
10899         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10900                 cgp = (struct cg *)bp->b_data;
10901                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10902                 blksfree = cg_blksfree(cgp);
10903                 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
                              /* A zero return means nothing was rolled back. */
10904                         if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10905                                 continue;
10906                         panic("initiate_write_bmsafemap: block %jd "
10907                             "marked free", jnewblk->jn_blkno);
10908                 }
10909         }
10910         /*
10911          * Move allocation lists to the written lists so they can be
10912          * cleared once the block write is complete.
10913          */
10914         LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10915             inodedep, id_deps);
10916         LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10917             newblk, nb_deps);
10918         LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10919             wk_list);
10920 }
10921
10922 /*
10923  * This routine is called during the completion interrupt
10924  * service routine for a disk write (from the procedure called
10925  * by the device driver to inform the filesystem caches of
10926  * a request completion).  It should be called early in this
10927  * procedure, before the block is made available to other
10928  * processes or other routines are called.
10929  *
       * A buffer with no dependencies is ignored.  Otherwise the softdep
       * lock of the owning mount is held while the dependencies are
       * processed:
       *
       *  - If the write failed (BIO_ERROR set, B_INVAL clear) the
       *    dependencies stay on the buffer and only the handlers for
       *    D_PAGEDEP, D_INODEDEP, D_BMSAFEMAP and D_INDIRDEP are run, with
       *    flags 0, to roll forward what was rolled back before the write.
       *  - Otherwise every dependency is removed from the buffer and
       *    dispatched by type with WRITESUCCEEDED; those whose handler
       *    returns non-zero are put back on the buffer afterwards.
       *
       * On both paths a buffer handed back through sbp is released with
       * brelse() after the lock has been dropped.
10930  */
10931 static void
10932 softdep_disk_write_complete(bp)
10933         struct buf *bp;         /* describes the completed disk write */
10934 {
10935         struct worklist *wk;
10936         struct worklist *owk;
10937         struct ufsmount *ump;
10938         struct workhead reattach;
10939         struct freeblks *freeblks;
10940         struct buf *sbp;
10941
              /* Nothing to do for a buffer that carries no dependencies. */
              if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
                      return;
              ump = VFSTOUFS(wk->wk_mp);
              /*
               * handle_written_indirdep() is given &sbp on both paths below,
               * so it must start out NULL for the brelse() checks to be valid.
               */
              sbp = NULL;
              /*
               * The handlers modify dependency state, so the error path runs
               * under the same lock as the success path.
               */
              ACQUIRE_LOCK(ump);
10942         /*
10943          * If an error occurred while doing the write, then the data
10944          * has not hit the disk and the dependencies cannot be processed.
10945          * But we do have to go through and roll forward any dependencies
10946          * that were rolled back before the disk write.
10947          */
10948         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
10949                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
10950                         switch (wk->wk_type) {
10951
10952                         case D_PAGEDEP:
10953                                 handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
10954                                 continue;
10955
10956                         case D_INODEDEP:
10957                                 handle_written_inodeblock(WK_INODEDEP(wk),
10958                                     bp, 0);
10959                                 continue;
10960
10961                         case D_BMSAFEMAP:
10962                                 handle_written_bmsafemap(WK_BMSAFEMAP(wk),
10963                                     bp, 0);
10964                                 continue;
10965
10966                         case D_INDIRDEP:
10967                                 handle_written_indirdep(WK_INDIRDEP(wk),
10968                                     bp, &sbp, 0);
10969                                 continue;
10970                         default:
10971                                 /* nothing to roll forward */
10972                                 continue;
10973                         }
10974                 }
                      FREE_LOCK(ump);
                      /* Release any buffer handed back, as the success path does. */
                      if (sbp)
                              brelse(sbp);
10975                 return;
10976         }
10980         LIST_INIT(&reattach);
10981         /*
10982          * This lock must not be released anywhere in this code segment.
10983          */
10985         owk = NULL;
10987         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10988                 WORKLIST_REMOVE(wk);
10989                 atomic_add_long(&dep_write[wk->wk_type], 1);
                      /* Seeing the same head twice means the removal did not take. */
10990                 if (wk == owk)
10991                         panic("duplicate worklist: %p\n", wk);
10992                 owk = wk;
10993                 switch (wk->wk_type) {
10994
10995                 case D_PAGEDEP:
10996                         if (handle_written_filepage(WK_PAGEDEP(wk), bp,
10997                             WRITESUCCEEDED))
10998                                 WORKLIST_INSERT(&reattach, wk);
10999                         continue;
11000
11001                 case D_INODEDEP:
11002                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
11003                             WRITESUCCEEDED))
11004                                 WORKLIST_INSERT(&reattach, wk);
11005                         continue;
11006
11007                 case D_BMSAFEMAP:
11008                         if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
11009                             WRITESUCCEEDED))
11010                                 WORKLIST_INSERT(&reattach, wk);
11011                         continue;
11012
11013                 case D_MKDIR:
11014                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
11015                         continue;
11016
11017                 case D_ALLOCDIRECT:
11018                         wk->wk_state |= COMPLETE;
11019                         handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
11020                         continue;
11021
11022                 case D_ALLOCINDIR:
11023                         wk->wk_state |= COMPLETE;
11024                         handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11025                         continue;
11026
11027                 case D_INDIRDEP:
11028                         if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
11029                             WRITESUCCEEDED))
11030                                 WORKLIST_INSERT(&reattach, wk);
11031                         continue;
11032
11033                 case D_FREEBLKS:
11034                         wk->wk_state |= COMPLETE;
11035                         freeblks = WK_FREEBLKS(wk);
11036                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11037                             LIST_EMPTY(&freeblks->fb_jblkdephd))
11038                                 add_to_worklist(wk, WK_NODELAY);
11039                         continue;
11040
11041                 case D_FREEWORK:
11042                         handle_written_freework(WK_FREEWORK(wk));
                              /* Nothing follows the switch, so this acts like continue. */
11043                         break;
11044
11045                 case D_JSEGDEP:
11046                         free_jsegdep(WK_JSEGDEP(wk));
11047                         continue;
11048
11049                 case D_JSEG:
11050                         handle_written_jseg(WK_JSEG(wk), bp);
11051                         continue;
11052
11053                 case D_SBDEP:
11054                         if (handle_written_sbdep(WK_SBDEP(wk), bp))
11055                                 WORKLIST_INSERT(&reattach, wk);
11056                         continue;
11057
11058                 case D_FREEDEP:
11059                         free_freedep(WK_FREEDEP(wk));
11060                         continue;
11061
11062                 default:
11063                         panic("handle_disk_write_complete: Unknown type %s",
11064                             TYPENAME(wk->wk_type));
11065                         /* NOTREACHED */
11066                 }
11067         }
11068         /*
11069          * Reattach any requests that must be redone.
11070          */
11071         while ((wk = LIST_FIRST(&reattach)) != NULL) {
11072                 WORKLIST_REMOVE(wk);
11073                 WORKLIST_INSERT(&bp->b_dep, wk);
11074         }
11075         FREE_LOCK(ump);
11076         if (sbp)
11077                 brelse(sbp);
11078 }
11079
11080 /*
11081  * Called from within softdep_disk_write_complete above. Note that
11082  * this routine is always called from interrupt level with further
11083  * splbio interrupts blocked.
       *
       * Does nothing unless adp is ALLCOMPLETE.  If adp is found on the
       * active update list of its inodedep (id_extupdt when EXTDATA is
       * set, id_inoupdt otherwise) and no earlier entry on that list
       * describes a fragment, adp and the run of ALLCOMPLETE entries that
       * follows it are moved to wkhd.  When wkhd is NULL the inodedep's
       * id_bufwait list is used instead.
11084  */
11085 static void
11086 handle_allocdirect_partdone(adp, wkhd)
11087         struct allocdirect *adp;        /* the completed allocdirect */
11088         struct workhead *wkhd;          /* Work to do when inode is written. */
11089 {
11090         struct allocdirectlst *listhead;
11091         struct allocdirect *listadp;
11092         struct inodedep *inodedep;
11093         long bsize;
11094
11095         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11096                 return;
11097         /*
11098          * The on-disk inode cannot claim to be any larger than the last
11099          * fragment that has been written. Otherwise, the on-disk inode
11100          * might have fragments that were not the last block in the file
11101          * which would corrupt the filesystem. Thus, we cannot free any
11102          * allocdirects after one whose ad_oldblkno claims a fragment as
11103          * these blocks must be rolled back to zero before writing the inode.
11104          * We check the currently active set of allocdirects in id_inoupdt
11105          * or id_extupdt as appropriate.
11106          */
11107         inodedep = adp->ad_inodedep;
11108         bsize = inodedep->id_fs->fs_bsize;
11109         if (adp->ad_state & EXTDATA)
11110                 listhead = &inodedep->id_extupdt;
11111         else
11112                 listhead = &inodedep->id_inoupdt;
11113         TAILQ_FOREACH(listadp, listhead, ad_next) {
11114                 /* found our block */
11115                 if (listadp == adp)
11116                         break;
11117                 /* continue if ad_oldsize does not describe a fragment */
11118                 if (listadp->ad_oldsize == 0 ||
11119                     listadp->ad_oldsize == bsize)
11120                         continue;
11121                 /* hit a fragment */
11122                 return;
11123         }
11124         /*
11125          * If we have reached the end of the current list without
11126          * finding the just finished dependency, then it must be
11127          * on the future dependency list. Future dependencies cannot
11128          * be freed until they are moved to the current list.
11129          */
11130         if (listadp == NULL) {
11131 #ifdef DEBUG
                      /* Debug check only: adp must be on the matching "new" list. */
11132                 if (adp->ad_state & EXTDATA)
11133                         listhead = &inodedep->id_newextupdt;
11134                 else
11135                         listhead = &inodedep->id_newinoupdt;
11136                 TAILQ_FOREACH(listadp, listhead, ad_next)
11137                         /* found our block */
11138                         if (listadp == adp)
11139                                 break;
11140                 if (listadp == NULL)
11141                         panic("handle_allocdirect_partdone: lost dep");
11142 #endif /* DEBUG */
11143                 return;
11144         }
11145         /*
11146          * If we have found the just finished dependency, then queue
11147          * it along with anything that follows it that is complete.
11148          * Since the pointer has not yet been written in the inode
11149          * as the dependency prevents it, place the allocdirect on the
11150          * bufwait list where it will be freed once the pointer is
11151          * valid.
11152          */
11153         if (wkhd == NULL)
11154                 wkhd = &inodedep->id_bufwait;
              /* The successor is fetched before adp is unlinked from the list. */
11155         for (; adp; adp = listadp) {
11156                 listadp = TAILQ_NEXT(adp, ad_next);
                      /* Stop at the first entry that is not yet ALLCOMPLETE. */
11157                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11158                         return;
11159                 TAILQ_REMOVE(listhead, adp, ad_next);
11160                 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11161         }
11162 }
11163
11164 /*
11165  * Called from within softdep_disk_write_complete above.  An allocindir
11166  * that is ALLCOMPLETE is either parked on ir_donehd or has its pointer
11167  * stored in ir_savebp and is queued on ir_writehd.
11168  */
11169 static void
11170 handle_allocindir_partdone(aip)
11171         struct allocindir *aip;         /* the completed allocindir */
11172 {
11173         struct indirdep *ird;
11174
11175         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11176                 return;
11177         ird = aip->ai_indirdep;
11178         LIST_REMOVE(aip, ai_next);
11179         /*
11180          * The pointer must not be set while the buffer is undergoing IO
11181          * or while truncations are active; hold the allocindir instead.
11182          */
11183         if ((ird->ir_state & UNDONE) != 0 || !TAILQ_EMPTY(&ird->ir_trunc)) {
11184                 LIST_INSERT_HEAD(&ird->ir_donehd, aip, ai_next);
11185                 return;
11186         }
11187         if ((ird->ir_state & UFS1FMT) != 0) {
11188                 ((ufs1_daddr_t *)ird->ir_savebp->b_data)[aip->ai_offset] =
11189                     aip->ai_newblkno;
11190         } else {
11191                 ((ufs2_daddr_t *)ird->ir_savebp->b_data)[aip->ai_offset] =
11192                     aip->ai_newblkno;
11193         }
11194         /* Await the pointer write before freeing the allocindir. */
11195         LIST_INSERT_HEAD(&ird->ir_writehd, aip, ai_next);
11196 }
11197
11198 /*
11199  * Drain a jwork list.  Each item is unlinked and then released or
11200  * processed according to its type; an item of any type other than
11201  * the four handled below is a fatal error.
11202  */
11203 static void
11204 handle_jwork(wkhd)
11205         struct workhead *wkhd;
11206 {
11207         struct worklist *item;
11208
11209         /* Always take the head; each pass unlinks the item it handles. */
11210         while ((item = LIST_FIRST(wkhd)) != NULL) {
11211                 WORKLIST_REMOVE(item);
11212                 if (item->wk_type == D_JSEGDEP) {
11213                         free_jsegdep(WK_JSEGDEP(item));
11214                 } else if (item->wk_type == D_FREEDEP) {
11215                         free_freedep(WK_FREEDEP(item));
11216                 } else if (item->wk_type == D_FREEFRAG) {
11217                         /* ff_jdep is read before the freefrag is freed. */
11218                         rele_jseg(WK_JSEG(WK_FREEFRAG(item)->ff_jdep));
11219                         WORKITEM_FREE(item, D_FREEFRAG);
11220                 } else if (item->wk_type == D_FREEWORK) {
11221                         handle_written_freework(WK_FREEWORK(item));
11222                 } else {
11223                         /* No other item type is handled on a jwork list. */
11224                         panic("handle_jwork: Unknown type %s\n",
11225                             TYPENAME(item->wk_type));
11226                 }
11227         }
11228 }
11229
11230 /*
11231  * Handle the bufwait list on an inode when it is safe to release items
11232  * held there.  This normally happens after an inode block is written but
11233  * may be delayed and handled later if there are pending journal items that
11234  * are not yet safe to be released.
       *
       *   inodedep - inode dependency whose id_bufwait list is drained
       *   refhd    - optional (may be NULL) list; when supplied, D_JSEGDEP
       *              and D_JADDREF items are moved onto it instead of
       *              being freed here
       *
       * Returns the D_FREEFILE item found on the list, or NULL if there was
       * none; it is not queued here.  Finding more than one freefile, or an
       * item of an unhandled type, causes a panic.
11235  */
11236 static struct freefile *
11237 handle_bufwait(inodedep, refhd)
11238         struct inodedep *inodedep;
11239         struct workhead *refhd;
11240 {
11241         struct jaddref *jaddref;
11242         struct freefile *freefile;
11243         struct worklist *wk;
11244
11245         freefile = NULL;
              /* Pop from the head until the bufwait list is empty. */
11246         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11247                 WORKLIST_REMOVE(wk);
11248                 switch (wk->wk_type) {
11249                 case D_FREEFILE:
11250                         /*
11251                          * We defer adding freefile to the worklist
11252                          * until all other additions have been made to
11253                          * ensure that it will be done after all the
11254                          * old blocks have been freed.
11255                          */
11256                         if (freefile != NULL)
11257                                 panic("handle_bufwait: freefile");
11258                         freefile = WK_FREEFILE(wk);
11259                         continue;
11260
11261                 case D_MKDIR:
11262                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11263                         continue;
11264
11265                 case D_DIRADD:
11266                         diradd_inode_written(WK_DIRADD(wk), inodedep);
11267                         continue;
11268
11269                 case D_FREEFRAG:
                              /* Queued only once every completion bit is set. */
11270                         wk->wk_state |= COMPLETE;
11271                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11272                                 add_to_worklist(wk, 0);
11273                         continue;
11274
11275                 case D_DIRREM:
                              /* Unlike D_FREEFRAG, queued without an ALLCOMPLETE test. */
11276                         wk->wk_state |= COMPLETE;
11277                         add_to_worklist(wk, 0);
11278                         continue;
11279
11280                 case D_ALLOCDIRECT:
11281                 case D_ALLOCINDIR:
11282                         free_newblk(WK_NEWBLK(wk));
11283                         continue;
11284
11285                 case D_JNEWBLK:
11286                         wk->wk_state |= COMPLETE;
11287                         free_jnewblk(WK_JNEWBLK(wk));
11288                         continue;
11289
11290                 /*
11291                  * Save freed journal segments and add references on
11292                  * the supplied list which will delay their release
11293                  * until the cg bitmap is cleared on disk.
11294                  */
11295                 case D_JSEGDEP:
11296                         if (refhd == NULL)
11297                                 free_jsegdep(WK_JSEGDEP(wk));
11298                         else
11299                                 WORKLIST_INSERT(refhd, wk);
11300                         continue;
11301
11302                 case D_JADDREF:
11303                         jaddref = WK_JADDREF(wk);
                              /* Always taken off the inode's reference list first. */
11304                         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11305                             if_deps);
11306                         /*
11307                          * Transfer any jaddrefs to the list to be freed with
11308                          * the bitmap if we're handling a removed file.
11309                          */
11310                         if (refhd == NULL) {
11311                                 wk->wk_state |= COMPLETE;
11312                                 free_jaddref(jaddref);
11313                         } else
11314                                 WORKLIST_INSERT(refhd, wk);
11315                         continue;
11316
11317                 default:
11318                         panic("handle_bufwait: Unknown type %p(%s)",
11319                             wk, TYPENAME(wk->wk_type));
11320                         /* NOTREACHED */
11321                 }
11322         }
11323         return (freefile);
11324 }
11325 /*
11326  * Called from within softdep_disk_write_complete above to restore
11327  * in-memory inode block contents to their most up-to-date state. Note
11328  * that this routine is always called from interrupt level with further
11329  * interrupts from this device blocked.
11330  *
11331  * If the write did not succeed, we will do all the roll-forward
11332  * operations, but we will not take the actions that will allow its
11333  * dependencies to be processed.
11334  */
11335 static int
11336 handle_written_inodeblock(inodedep, bp, flags)
11337         struct inodedep *inodedep;
11338         struct buf *bp;         /* buffer containing the inode block */
11339         int flags;
11340 {
11341         struct freefile *freefile;
11342         struct allocdirect *adp, *nextadp;
11343         struct ufs1_dinode *dp1 = NULL;
11344         struct ufs2_dinode *dp2 = NULL;
11345         struct workhead wkhd;
11346         int hadchanges, fstype;
11347         ino_t freelink;
11348
11349         LIST_INIT(&wkhd);
11350         hadchanges = 0;
11351         freefile = NULL;
11352         if ((inodedep->id_state & IOSTARTED) == 0)
11353                 panic("handle_written_inodeblock: not started");
11354         inodedep->id_state &= ~IOSTARTED;
11355         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11356                 fstype = UFS1;
11357                 dp1 = (struct ufs1_dinode *)bp->b_data +
11358                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11359                 freelink = dp1->di_freelink;
11360         } else {
11361                 fstype = UFS2;
11362                 dp2 = (struct ufs2_dinode *)bp->b_data +
11363                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11364                 freelink = dp2->di_freelink;
11365         }
11366         /*
11367          * Leave this inodeblock dirty until it's in the list.
11368          */
11369         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
11370             (flags & WRITESUCCEEDED)) {
11371                 struct inodedep *inon;
11372
11373                 inon = TAILQ_NEXT(inodedep, id_unlinked);
11374                 if ((inon == NULL && freelink == 0) ||
11375                     (inon && inon->id_ino == freelink)) {
11376                         if (inon)
11377                                 inon->id_state |= UNLINKPREV;
11378                         inodedep->id_state |= UNLINKNEXT;
11379                 }
11380                 hadchanges = 1;
11381         }
11382         /*
11383          * If we had to rollback the inode allocation because of
11384          * bitmaps being incomplete, then simply restore it.
11385          * Keep the block dirty so that it will not be reclaimed until
11386          * all associated dependencies have been cleared and the
11387          * corresponding updates written to disk.
11388          */
11389         if (inodedep->id_savedino1 != NULL) {
11390                 hadchanges = 1;
11391                 if (fstype == UFS1)
11392                         *dp1 = *inodedep->id_savedino1;
11393                 else
11394                         *dp2 = *inodedep->id_savedino2;
11395                 free(inodedep->id_savedino1, M_SAVEDINO);
11396                 inodedep->id_savedino1 = NULL;
11397                 if ((bp->b_flags & B_DELWRI) == 0)
11398                         stat_inode_bitmap++;
11399                 bdirty(bp);
11400                 /*
11401                  * If the inode is clear here and GOINGAWAY it will never
11402                  * be written.  Process the bufwait and clear any pending
11403                  * work which may include the freefile.
11404                  */
11405                 if (inodedep->id_state & GOINGAWAY)
11406                         goto bufwait;
11407                 return (1);
11408         }
11409         if (flags & WRITESUCCEEDED)
11410                 inodedep->id_state |= COMPLETE;
11411         /*
11412          * Roll forward anything that had to be rolled back before
11413          * the inode could be updated.
11414          */
11415         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11416                 nextadp = TAILQ_NEXT(adp, ad_next);
11417                 if (adp->ad_state & ATTACHED)
11418                         panic("handle_written_inodeblock: new entry");
11419                 if (fstype == UFS1) {
11420                         if (adp->ad_offset < NDADDR) {
11421                                 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11422                                         panic("%s %s #%jd mismatch %d != %jd",
11423                                             "handle_written_inodeblock:",
11424                                             "direct pointer",
11425                                             (intmax_t)adp->ad_offset,
11426                                             dp1->di_db[adp->ad_offset],
11427                                             (intmax_t)adp->ad_oldblkno);
11428                                 dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11429                         } else {
11430                                 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11431                                         panic("%s: %s #%jd allocated as %d",
11432                                             "handle_written_inodeblock",
11433                                             "indirect pointer",
11434                                             (intmax_t)adp->ad_offset - NDADDR,
11435                                             dp1->di_ib[adp->ad_offset - NDADDR]);
11436                                 dp1->di_ib[adp->ad_offset - NDADDR] =
11437                                     adp->ad_newblkno;
11438                         }
11439                 } else {
11440                         if (adp->ad_offset < NDADDR) {
11441                                 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11442                                         panic("%s: %s #%jd %s %jd != %jd",
11443                                             "handle_written_inodeblock",
11444                                             "direct pointer",
11445                                             (intmax_t)adp->ad_offset, "mismatch",
11446                                             (intmax_t)dp2->di_db[adp->ad_offset],
11447                                             (intmax_t)adp->ad_oldblkno);
11448                                 dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11449                         } else {
11450                                 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11451                                         panic("%s: %s #%jd allocated as %jd",
11452                                             "handle_written_inodeblock",
11453                                             "indirect pointer",
11454                                             (intmax_t)adp->ad_offset - NDADDR,
11455                                             (intmax_t)
11456                                             dp2->di_ib[adp->ad_offset - NDADDR]);
11457                                 dp2->di_ib[adp->ad_offset - NDADDR] =
11458                                     adp->ad_newblkno;
11459                         }
11460                 }
11461                 adp->ad_state &= ~UNDONE;
11462                 adp->ad_state |= ATTACHED;
11463                 hadchanges = 1;
11464         }
11465         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11466                 nextadp = TAILQ_NEXT(adp, ad_next);
11467                 if (adp->ad_state & ATTACHED)
11468                         panic("handle_written_inodeblock: new entry");
11469                 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11470                         panic("%s: direct pointers #%jd %s %jd != %jd",
11471                             "handle_written_inodeblock",
11472                             (intmax_t)adp->ad_offset, "mismatch",
11473                             (intmax_t)dp2->di_extb[adp->ad_offset],
11474                             (intmax_t)adp->ad_oldblkno);
11475                 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11476                 adp->ad_state &= ~UNDONE;
11477                 adp->ad_state |= ATTACHED;
11478                 hadchanges = 1;
11479         }
11480         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11481                 stat_direct_blk_ptrs++;
11482         /*
11483          * Reset the file size to its most up-to-date value.
11484          */
11485         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11486                 panic("handle_written_inodeblock: bad size");
11487         if (inodedep->id_savednlink > LINK_MAX)
11488                 panic("handle_written_inodeblock: Invalid link count "
11489                     "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11490         if (fstype == UFS1) {
11491                 if (dp1->di_nlink != inodedep->id_savednlink) {
11492                         dp1->di_nlink = inodedep->id_savednlink;
11493                         hadchanges = 1;
11494                 }
11495                 if (dp1->di_size != inodedep->id_savedsize) {
11496                         dp1->di_size = inodedep->id_savedsize;
11497                         hadchanges = 1;
11498                 }
11499         } else {
11500                 if (dp2->di_nlink != inodedep->id_savednlink) {
11501                         dp2->di_nlink = inodedep->id_savednlink;
11502                         hadchanges = 1;
11503                 }
11504                 if (dp2->di_size != inodedep->id_savedsize) {
11505                         dp2->di_size = inodedep->id_savedsize;
11506                         hadchanges = 1;
11507                 }
11508                 if (dp2->di_extsize != inodedep->id_savedextsize) {
11509                         dp2->di_extsize = inodedep->id_savedextsize;
11510                         hadchanges = 1;
11511                 }
11512         }
11513         inodedep->id_savedsize = -1;
11514         inodedep->id_savedextsize = -1;
11515         inodedep->id_savednlink = -1;
11516         /*
11517          * If there were any rollbacks in the inode block, then it must be
11518          * marked dirty so that its will eventually get written back in
11519          * its correct form.
11520          */
11521         if (hadchanges)
11522                 bdirty(bp);
11523 bufwait:
11524         /*
11525          * If the write did not succeed, we have done all the roll-forward
11526          * operations, but we cannot take the actions that will allow its
11527          * dependencies to be processed.
11528          */
11529         if ((flags & WRITESUCCEEDED) == 0)
11530                 return (hadchanges);
11531         /*
11532          * Process any allocdirects that completed during the update.
11533          */
11534         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11535                 handle_allocdirect_partdone(adp, &wkhd);
11536         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11537                 handle_allocdirect_partdone(adp, &wkhd);
11538         /*
11539          * Process deallocations that were held pending until the
11540          * inode had been written to disk. Freeing of the inode
11541          * is delayed until after all blocks have been freed to
11542          * avoid creation of new <vfsid, inum, lbn> triples
11543          * before the old ones have been deleted.  Completely
11544          * unlinked inodes are not processed until the unlinked
11545          * inode list is written or the last reference is removed.
11546          */
11547         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11548                 freefile = handle_bufwait(inodedep, NULL);
11549                 if (freefile && !LIST_EMPTY(&wkhd)) {
11550                         WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11551                         freefile = NULL;
11552                 }
11553         }
11554         /*
11555          * Move rolled forward dependency completions to the bufwait list
11556          * now that those that were already written have been processed.
11557          */
11558         if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11559                 panic("handle_written_inodeblock: bufwait but no changes");
11560         jwork_move(&inodedep->id_bufwait, &wkhd);
11561
11562         if (freefile != NULL) {
11563                 /*
11564                  * If the inode is goingaway it was never written.  Fake up
11565                  * the state here so free_inodedep() can succeed.
11566                  */
11567                 if (inodedep->id_state & GOINGAWAY)
11568                         inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11569                 if (free_inodedep(inodedep) == 0)
11570                         panic("handle_written_inodeblock: live inodedep %p",
11571                             inodedep);
11572                 add_to_worklist(&freefile->fx_list, 0);
11573                 return (0);
11574         }
11575
11576         /*
11577          * If no outstanding dependencies, free it.
11578          */
11579         if (free_inodedep(inodedep) ||
11580             (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11581              TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11582              TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11583              LIST_FIRST(&inodedep->id_bufwait) == 0))
11584                 return (0);
11585         return (hadchanges);
11586 }
11587
11588 /*
11589  * Perform needed roll-forwards and kick off any dependencies that
11590  * can now be processed.
11591  *
11592  * If the write did not succeed, we will do all the roll-forward
11593  * operations, but we will not take the actions that will allow its
11594  * dependencies to be processed.
11595  */
11596 static int
11597 handle_written_indirdep(indirdep, bp, bpp, flags)
11598         struct indirdep *indirdep;
11599         struct buf *bp;
11600         struct buf **bpp;
11601         int flags;
11602 {
11603         struct allocindir *aip;
11604         struct buf *sbp;
11605         int chgs;
11606
11607         if (indirdep->ir_state & GOINGAWAY)
11608                 panic("handle_written_indirdep: indirdep gone");
11609         if ((indirdep->ir_state & IOSTARTED) == 0)
11610                 panic("handle_written_indirdep: IO not started");
11611         chgs = 0;
11612         /*
11613          * If there were rollbacks revert them here.
11614          */
11615         if (indirdep->ir_saveddata) {
11616                 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11617                 if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11618                         free(indirdep->ir_saveddata, M_INDIRDEP);
11619                         indirdep->ir_saveddata = NULL;
11620                 }
11621                 chgs = 1;
11622         }
11623         indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11624         indirdep->ir_state |= ATTACHED;
11625         /*
11626          * If the write did not succeed, we have done all the roll-forward
11627          * operations, but we cannot take the actions that will allow its
11628          * dependencies to be processed.
11629          */
11630         if ((flags & WRITESUCCEEDED) == 0) {
11631                 stat_indir_blk_ptrs++;
11632                 bdirty(bp);
11633                 return (1);
11634         }
11635         /*
11636          * Move allocindirs with written pointers to the completehd if
11637          * the indirdep's pointer is not yet written.  Otherwise
11638          * free them here.
11639          */
11640         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
11641                 LIST_REMOVE(aip, ai_next);
11642                 if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11643                         LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11644                             ai_next);
11645                         newblk_freefrag(&aip->ai_block);
11646                         continue;
11647                 }
11648                 free_newblk(&aip->ai_block);
11649         }
11650         /*
11651          * Move allocindirs that have finished dependency processing from
11652          * the done list to the write list after updating the pointers.
11653          */
11654         if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11655                 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
11656                         handle_allocindir_partdone(aip);
11657                         if (aip == LIST_FIRST(&indirdep->ir_donehd))
11658                                 panic("disk_write_complete: not gone");
11659                         chgs = 1;
11660                 }
11661         }
11662         /*
11663          * Preserve the indirdep if there were any changes or if it is not
11664          * yet valid on disk.
11665          */
11666         if (chgs) {
11667                 stat_indir_blk_ptrs++;
11668                 bdirty(bp);
11669                 return (1);
11670         }
11671         /*
11672          * If there were no changes we can discard the savedbp and detach
11673          * ourselves from the buf.  We are only carrying completed pointers
11674          * in this case.
11675          */
11676         sbp = indirdep->ir_savebp;
11677         sbp->b_flags |= B_INVAL | B_NOCACHE;
11678         indirdep->ir_savebp = NULL;
11679         indirdep->ir_bp = NULL;
11680         if (*bpp != NULL)
11681                 panic("handle_written_indirdep: bp already exists.");
11682         *bpp = sbp;
11683         /*
11684          * The indirdep may not be freed until its parent points at it.
11685          */
11686         if (indirdep->ir_state & DEPCOMPLETE)
11687                 free_indirdep(indirdep);
11688
11689         return (0);
11690 }
11691
11692 /*
11693  * Process a diradd entry after its dependent inode has been written.
11694  * This routine must be called with splbio interrupts blocked.
11695  */
11696 static void
11697 diradd_inode_written(dap, inodedep)
11698         struct diradd *dap;
11699         struct inodedep *inodedep;
11700 {
11701
11702         dap->da_state |= COMPLETE;
11703         complete_diradd(dap);
11704         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11705 }
11706
11707 /*
11708  * Returns true if the bmsafemap will have rollbacks when written.  Must only
11709  * be called with the per-filesystem lock and the buf lock on the cg held.
11710  */
11711 static int
11712 bmsafemap_backgroundwrite(bmsafemap, bp)
11713         struct bmsafemap *bmsafemap;
11714         struct buf *bp;
11715 {
11716         int dirty;
11717
11718         LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11719         dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11720             !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11721         /*
11722          * If we're initiating a background write we need to process the
11723          * rollbacks as they exist now, not as they exist when IO starts.
11724          * No other consumers will look at the contents of the shadowed
11725          * buf so this is safe to do here.
11726          */
11727         if (bp->b_xflags & BX_BKGRDMARKER)
11728                 initiate_write_bmsafemap(bmsafemap, bp);
11729
11730         return (dirty);
11731 }
11732
11733 /*
11734  * Re-apply an allocation when a cg write is complete.
11735  */
11736 static int
11737 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11738         struct jnewblk *jnewblk;
11739         struct fs *fs;
11740         struct cg *cgp;
11741         uint8_t *blksfree;
11742 {
11743         ufs1_daddr_t fragno;
11744         ufs2_daddr_t blkno;
11745         long cgbno, bbase;
11746         int frags, blk;
11747         int i;
11748
11749         frags = 0;
11750         cgbno = dtogd(fs, jnewblk->jn_blkno);
11751         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11752                 if (isclr(blksfree, cgbno + i))
11753                         panic("jnewblk_rollforward: re-allocated fragment");
11754                 frags++;
11755         }
11756         if (frags == fs->fs_frag) {
11757                 blkno = fragstoblks(fs, cgbno);
11758                 ffs_clrblock(fs, blksfree, (long)blkno);
11759                 ffs_clusteracct(fs, cgp, blkno, -1);
11760                 cgp->cg_cs.cs_nbfree--;
11761         } else {
11762                 bbase = cgbno - fragnum(fs, cgbno);
11763                 cgbno += jnewblk->jn_oldfrags;
11764                 /* If a complete block had been reassembled, account for it. */
11765                 fragno = fragstoblks(fs, bbase);
11766                 if (ffs_isblock(fs, blksfree, fragno)) {
11767                         cgp->cg_cs.cs_nffree += fs->fs_frag;
11768                         ffs_clusteracct(fs, cgp, fragno, -1);
11769                         cgp->cg_cs.cs_nbfree--;
11770                 }
11771                 /* Decrement the old frags.  */
11772                 blk = blkmap(fs, blksfree, bbase);
11773                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11774                 /* Allocate the fragment */
11775                 for (i = 0; i < frags; i++)
11776                         clrbit(blksfree, cgbno + i);
11777                 cgp->cg_cs.cs_nffree -= frags;
11778                 /* Add back in counts associated with the new frags */
11779                 blk = blkmap(fs, blksfree, bbase);
11780                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11781         }
11782         return (frags);
11783 }
11784
11785 /*
11786  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11787  * changes if it's not a background write.  Set all written dependencies
11788  * to DEPCOMPLETE and free the structure if possible.
11789  *
11790  * If the write did not succeed, we will do all the roll-forward
11791  * operations, but we will not take the actions that will allow its
11792  * dependencies to be processed.
11793  */
11794 static int
11795 handle_written_bmsafemap(bmsafemap, bp, flags)
11796         struct bmsafemap *bmsafemap;
11797         struct buf *bp;
11798         int flags;
11799 {
11800         struct newblk *newblk;
11801         struct inodedep *inodedep;
11802         struct jaddref *jaddref, *jatmp;
11803         struct jnewblk *jnewblk, *jntmp;
11804         struct ufsmount *ump;
11805         uint8_t *inosused;
11806         uint8_t *blksfree;
11807         struct cg *cgp;
11808         struct fs *fs;
11809         ino_t ino;
11810         int foreground;
11811         int chgs;
11812
11813         if ((bmsafemap->sm_state & IOSTARTED) == 0)
11814                 panic("handle_written_bmsafemap: Not started\n");
11815         ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11816         chgs = 0;
11817         bmsafemap->sm_state &= ~IOSTARTED;
11818         foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11819         /*
11820          * If write was successful, release journal work that was waiting
11821          * on the write. Otherwise move the work back.
11822          */
11823         if (flags & WRITESUCCEEDED)
11824                 handle_jwork(&bmsafemap->sm_freewr);
11825         else
11826                 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
11827                     worklist, wk_list);
11828
11829         /*
11830          * Restore unwritten inode allocation pending jaddref writes.
11831          */
11832         if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11833                 cgp = (struct cg *)bp->b_data;
11834                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11835                 inosused = cg_inosused(cgp);
11836                 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11837                     ja_bmdeps, jatmp) {
11838                         if ((jaddref->ja_state & UNDONE) == 0)
11839                                 continue;
11840                         ino = jaddref->ja_ino % fs->fs_ipg;
11841                         if (isset(inosused, ino))
11842                                 panic("handle_written_bmsafemap: "
11843                                     "re-allocated inode");
11844                         /* Do the roll-forward only if it's a real copy. */
11845                         if (foreground) {
11846                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
11847                                         cgp->cg_cs.cs_ndir++;
11848                                 cgp->cg_cs.cs_nifree--;
11849                                 setbit(inosused, ino);
11850                                 chgs = 1;
11851                         }
11852                         jaddref->ja_state &= ~UNDONE;
11853                         jaddref->ja_state |= ATTACHED;
11854                         free_jaddref(jaddref);
11855                 }
11856         }
11857         /*
11858          * Restore any block allocations which are pending journal writes.
11859          */
11860         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11861                 cgp = (struct cg *)bp->b_data;
11862                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11863                 blksfree = cg_blksfree(cgp);
11864                 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11865                     jntmp) {
11866                         if ((jnewblk->jn_state & UNDONE) == 0)
11867                                 continue;
11868                         /* Do the roll-forward only if it's a real copy. */
11869                         if (foreground &&
11870                             jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11871                                 chgs = 1;
11872                         jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11873                         jnewblk->jn_state |= ATTACHED;
11874                         free_jnewblk(jnewblk);
11875                 }
11876         }
11877         /*
11878          * If the write did not succeed, we have done all the roll-forward
11879          * operations, but we cannot take the actions that will allow its
11880          * dependencies to be processed.
11881          */
11882         if ((flags & WRITESUCCEEDED) == 0) {
11883                 LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
11884                     newblk, nb_deps);
11885                 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
11886                     worklist, wk_list);
11887                 if (foreground)
11888                         bdirty(bp);
11889                 return (1);
11890         }
11891         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11892                 newblk->nb_state |= DEPCOMPLETE;
11893                 newblk->nb_state &= ~ONDEPLIST;
11894                 newblk->nb_bmsafemap = NULL;
11895                 LIST_REMOVE(newblk, nb_deps);
11896                 if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11897                         handle_allocdirect_partdone(
11898                             WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11899                 else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11900                         handle_allocindir_partdone(
11901                             WK_ALLOCINDIR(&newblk->nb_list));
11902                 else if (newblk->nb_list.wk_type != D_NEWBLK)
11903                         panic("handle_written_bmsafemap: Unexpected type: %s",
11904                             TYPENAME(newblk->nb_list.wk_type));
11905         }
11906         while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11907                 inodedep->id_state |= DEPCOMPLETE;
11908                 inodedep->id_state &= ~ONDEPLIST;
11909                 LIST_REMOVE(inodedep, id_deps);
11910                 inodedep->id_bmsafemap = NULL;
11911         }
11912         LIST_REMOVE(bmsafemap, sm_next);
11913         if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11914             LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11915             LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11916             LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11917             LIST_EMPTY(&bmsafemap->sm_freehd)) {
11918                 LIST_REMOVE(bmsafemap, sm_hash);
11919                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11920                 return (0);
11921         }
11922         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11923         if (foreground)
11924                 bdirty(bp);
11925         return (1);
11926 }
11927
11928 /*
11929  * Try to free a mkdir dependency.
11930  */
11931 static void
11932 complete_mkdir(mkdir)
11933         struct mkdir *mkdir;
11934 {
11935         struct diradd *dap;
11936
11937         if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11938                 return;
11939         LIST_REMOVE(mkdir, md_mkdirs);
11940         dap = mkdir->md_diradd;
11941         dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11942         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11943                 dap->da_state |= DEPCOMPLETE;
11944                 complete_diradd(dap);
11945         }
11946         WORKITEM_FREE(mkdir, D_MKDIR);
11947 }
11948
11949 /*
11950  * Handle the completion of a mkdir dependency.
11951  */
11952 static void
11953 handle_written_mkdir(mkdir, type)
11954         struct mkdir *mkdir;
11955         int type;
11956 {
11957
11958         if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11959                 panic("handle_written_mkdir: bad type");
11960         mkdir->md_state |= COMPLETE;
11961         complete_mkdir(mkdir);
11962 }
11963
11964 static int
11965 free_pagedep(pagedep)
11966         struct pagedep *pagedep;
11967 {
11968         int i;
11969
11970         if (pagedep->pd_state & NEWBLOCK)
11971                 return (0);
11972         if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11973                 return (0);
11974         for (i = 0; i < DAHASHSZ; i++)
11975                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11976                         return (0);
11977         if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11978                 return (0);
11979         if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11980                 return (0);
11981         if (pagedep->pd_state & ONWORKLIST)
11982                 WORKLIST_REMOVE(&pagedep->pd_list);
11983         LIST_REMOVE(pagedep, pd_hash);
11984         WORKITEM_FREE(pagedep, D_PAGEDEP);
11985
11986         return (1);
11987 }
11988
11989 /*
11990  * Called from within softdep_disk_write_complete above when the write
11991  * of a directory page has finished.  On success, committed removals
11992  * are queued on the worklist and committed additions are freed.
11993  * Note that this routine is always called from interrupt level
11994  * with further interrupts from this device blocked.
11995  *
11996  * If the write did not succeed (WRITESUCCEEDED clear in flags), only the
11997  * roll-forward is done and the buffer is redirtied.  Returns 1 when the
11998  * buffer was redirtied, 0 after free_pagedep() has been called.
11999  */
12000 static int
12001 handle_written_filepage(pagedep, bp, flags)
12002         struct pagedep *pagedep; /* dependencies tracked for the page */
12003         struct buf *bp;         /* buffer containing the written page */
12004         int flags;              /* WRITESUCCEEDED set if the write worked */
12005 {
12006         struct dirrem *dirrem;
12007         struct diradd *dap, *nextdap;
12008         struct direct *ep;
12009         int i, chgs;
12010
12011         if ((pagedep->pd_state & IOSTARTED) == 0)
12012                 panic("handle_written_filepage: not started");
12013         pagedep->pd_state &= ~IOSTARTED;
12014         if ((flags & WRITESUCCEEDED) == 0)
12015                 goto rollforward;
12016         /*
12017          * Committed removals: mark COMPLETE, record pd_ino, queue as work.
12018          */
12019         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
12020                 LIST_REMOVE(dirrem, dm_next);
12021                 dirrem->dm_state |= COMPLETE;
12022                 dirrem->dm_dirinum = pagedep->pd_ino;
12023                 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
12024                     ("handle_written_filepage: Journal entries not written."));
12025                 add_to_worklist(&dirrem->dm_list, 0);
12026         }
12027         /*
12028          * Free any directory additions that have been committed.
12029          * If it is a newly allocated block, we have to wait until
12030          * the on-disk directory inode claims the new block.
12031          */
12032         if ((pagedep->pd_state & NEWBLOCK) == 0)
12033                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
12034                         free_diradd(dap, NULL);
12035 rollforward:
12036         /*
12037          * Uncommitted directory entries must be restored in the buffer.
12038          */
12039         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
12040                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
12041                      dap = nextdap) {
12042                         nextdap = LIST_NEXT(dap, da_pdlist); /* dap may move */
12043                         if (dap->da_state & ATTACHED)
12044                                 panic("handle_written_filepage: attached");
12045                         ep = (struct direct *)
12046                             ((char *)bp->b_data + dap->da_offset);
12047                         ep->d_ino = dap->da_newinum;
12048                         dap->da_state &= ~UNDONE;
12049                         dap->da_state |= ATTACHED;
12050                         chgs = 1;       /* the buffer contents were modified */
12051                         /*
12052                          * If the inode referenced by the directory has
12053                          * been written out, then the dependency can be
12054                          * moved to the pending list.
12055                          */
12056                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
12057                                 LIST_REMOVE(dap, da_pdlist);
12058                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
12059                                     da_pdlist);
12060                         }
12061                 }
12062         }
12063         /*
12064          * If there were any rollbacks in the directory, or the write
12065          * failed, then the buffer must be marked dirty so that it will
12066          * eventually get written back in its correct form.
12067          */
12068         if (chgs || (flags & WRITESUCCEEDED) == 0) {
12069                 if ((bp->b_flags & B_DELWRI) == 0)
12070                         stat_dir_entry++;
12071                 bdirty(bp);
12072                 return (1);
12073         }
12074         /*
12075          * If we are not waiting for a new directory block to be
12076          * claimed by its inode, then the pagedep will be freed.
12077          * Otherwise it will remain to track any new entries on the page
12078          * (presumably decided inside free_pagedep() -- TODO confirm).
12079          */
12080         free_pagedep(pagedep);
12081         return (0);
12082 }
12083
12084 /*
12085  * Writing back in-core inode structures.
12086  *
12087  * The filesystem only accesses an inode's contents when it occupies an
12088  * "in-core" inode structure.  These "in-core" structures are separate from
12089  * the page frames used to cache inode blocks.  Only the latter are
12090  * transferred to/from the disk.  So, when the updated contents of the
12091  * "in-core" inode structure are copied to the corresponding in-memory inode
12092  * block, the dependencies are also transferred.  The following procedure is
12093  * called when copying a dirty "in-core" inode to a cached inode block.
12094  */
12095
12096 /*
12097  * Called when an inode is loaded from disk.  The effective link count
12098  * starts out equal to the actual link count; if an inodedep for this
12099  * inode is still found, its recorded link delta is subtracted again.
12100  */
12101 void
12102 softdep_load_inodeblock(ip)
12103         struct inode *ip;       /* the "in_core" copy of the inode */
12104 {
12105         struct inodedep *idep;
12106         struct ufsmount *ump;
12107
12108         ump = ip->i_ump;
12109         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
12110             ("softdep_load_inodeblock called on non-softdep filesystem"));
12111         /*
12112          * Seed the effective count, then apply any alternate nlink
12113          * delta while the lock taken by ACQUIRE_LOCK() is held.
12114          */
12115         ip->i_effnlink = ip->i_nlink;
12116         ACQUIRE_LOCK(ump);
12117         if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0,
12118             &idep) != 0)
12119                 ip->i_effnlink -= idep->id_nlinkdelta;
12120         FREE_LOCK(ump);
12121 }
12122
12123 /*
12124  * This routine is called just before the "in-core" inode
12125  * information is to be copied to the in-memory inode block.
12126  * Recall that an inode block contains several inodes. If waitfor
12127  * is nonzero, pending journal writes are waited for and the bitmap
12128  * buffer is written so that the update can always be made. Note that
12129  * the buffer is locked when this routine is called, so we
12130  * will never be in the middle of writing the inode block
12131  * to disk.
12132  */
12133 void
12134 softdep_update_inodeblock(ip, bp, waitfor)
12135         struct inode *ip;       /* the "in_core" copy of the inode */
12136         struct buf *bp;         /* the buffer containing the inode block */
12137         int waitfor;            /* nonzero => update must be allowed */
12138 {
12139         struct inodedep *inodedep;
12140         struct inoref *inoref;
12141         struct ufsmount *ump;
12142         struct worklist *wk;
12143         struct mount *mp;
12144         struct buf *ibp;
12145         struct fs *fs;
12146         int error;
12147
12148         ump = ip->i_ump;
12149         mp = UFSTOVFS(ump);
12150         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12151             ("softdep_update_inodeblock called on non-softdep filesystem"));
12152         fs = ip->i_fs;
12153         /*
12154          * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12155          * does not have access to the in-core ip so must write directly into
12156          * the inode block buffer when setting freelink.
12157          */
12158         if (fs->fs_magic == FS_UFS1_MAGIC)
12159                 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12160                     ino_to_fsbo(fs, ip->i_number))->di_freelink);
12161         else
12162                 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12163                     ino_to_fsbo(fs, ip->i_number))->di_freelink);
12164         /*
12165          * If the effective link count is not equal to the actual link
12166          * count, then we must track the difference in an inodedep while
12167          * the inode is (potentially) tossed out of the cache. Otherwise,
12168          * if there is no existing inodedep, then there are no dependencies
12169          * to track.
12170          */
12171         ACQUIRE_LOCK(ump);
12172 again:
12173         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12174                 FREE_LOCK(ump);
12175                 if (ip->i_effnlink != ip->i_nlink)
12176                         panic("softdep_update_inodeblock: bad link count");
12177                 return;
12178         }
12179         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12180                 panic("softdep_update_inodeblock: bad delta");
12181         /*
12182          * If we're flushing all dependencies we must also move any waiting
12183          * for journal writes onto the bufwait list prior to I/O.
12184          */
12185         if (waitfor) {
12186                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12187                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12188                             == DEPCOMPLETE) {
12189                                 jwait(&inoref->if_list, MNT_WAIT);
12190                                 goto again;     /* redo the lookup after waiting */
12191                         }
12192                 }
12193         }
12194         /*
12195          * Changes have been initiated. Anything depending on these
12196          * changes cannot occur until this inode has been written.
12197          */
12198         inodedep->id_state &= ~COMPLETE;
12199         if ((inodedep->id_state & ONWORKLIST) == 0)
12200                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12201         /*
12202          * Any new dependencies associated with the incore inode must
12203          * now be moved to the list associated with the buffer holding
12204          * the in-memory copy of the inode. Once merged process any
12205          * allocdirects that are completed by the merger.
12206          */
12207         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12208         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12209                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12210                     NULL);
12211         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12212         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12213                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12214                     NULL);
12215         /*
12216          * Now that the inode has been pushed into the buffer, the
12217          * operations dependent on the inode being written to disk
12218          * can be moved to the id_bufwait so that they will be
12219          * processed when the buffer I/O completes.
12220          */
12221         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12222                 WORKLIST_REMOVE(wk);
12223                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12224         }
12225         /*
12226          * Newly allocated inodes cannot be written until the bitmap
12227          * that allocates them has been written (indicated by
12228          * DEPCOMPLETE being set in id_state). If we are doing a
12229          * forced sync (e.g., an fsync on a file), we force the bitmap
12230          * to be written so that the update can be done.
12231          */
12232         if (waitfor == 0) {
12233                 FREE_LOCK(ump);
12234                 return;
12235         }
12236 retry:
12237         if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12238                 FREE_LOCK(ump);
12239                 return;
12240         }
12241         ibp = inodedep->id_bmsafemap->sm_buf;   /* bitmap buffer to force out */
12242         ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12243         if (ibp == NULL) {
12244                 /*
12245                  * If ibp came back as NULL, the dependency could have been
12246                  * freed while we slept.  Look it up again, and check to see
12247                  * that it has completed.
12248                  */
12249                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12250                         goto retry;
12251                 FREE_LOCK(ump);
12252                 return;
12253         }
12254         FREE_LOCK(ump);         /* the write below is done without the lock */
12255         if ((error = bwrite(ibp)) != 0)
12256                 softdep_error("softdep_update_inodeblock: bwrite", error);
12257 }
12258
12259 /*
12260  * Merge a new inode dependency list (such as id_newinoupdt) into an
12261  * old inode dependency list (such as id_inoupdt), comparing ad_offset.
12262  * This routine must be called with splbio interrupts blocked.
12263  */
12264 static void
12265 merge_inode_lists(newlisthead, oldlisthead)
12266         struct allocdirectlst *newlisthead;
12267         struct allocdirectlst *oldlisthead;
12268 {
12269         struct allocdirect *oldadp, *adp;
12270
12271         oldadp = TAILQ_FIRST(oldlisthead);
12272         while (oldadp != NULL && (adp = TAILQ_FIRST(newlisthead)) != NULL) {
12273                 if (oldadp->ad_offset < adp->ad_offset) {
12274                         oldadp = TAILQ_NEXT(oldadp, ad_next);
12275                         continue;
12276                 }
12277                 TAILQ_REMOVE(newlisthead, adp, ad_next);
12278                 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
12279                 if (oldadp->ad_offset != adp->ad_offset)
12280                         continue;
12281                 /* Same offset: let allocdirect_merge() reconcile the pair. */
12282                 allocdirect_merge(oldlisthead, adp, oldadp);
12283                 oldadp = adp;
12284         }
12285         /* Old list exhausted: append whatever remains on the new list. */
12286         while ((adp = TAILQ_FIRST(newlisthead)) != NULL) {
12287                 TAILQ_REMOVE(newlisthead, adp, ad_next);
12288                 TAILQ_INSERT_TAIL(oldlisthead, adp, ad_next);
12289         }
12290 }
12291
12292 /*
12293  * If we are doing an fsync, then we must ensure that any directory entries
12294  * for the inode reach disk after the inode does.  Returns 0 or an errno.
12295  */
12296 int
12297 softdep_fsync(vp)
12298         struct vnode *vp;       /* the "in_core" copy of the inode */
12299 {
12300         struct inodedep *inodedep;
12301         struct pagedep *pagedep;
12302         struct inoref *inoref;
12303         struct ufsmount *ump;
12304         struct worklist *wk;
12305         struct diradd *dap;
12306         struct mount *mp;
12307         struct vnode *pvp;
12308         struct inode *ip;
12309         struct buf *bp;
12310         struct fs *fs;
12311         struct thread *td = curthread;
12312         int error, flushparent, pagedep_new_block;
12313         ino_t parentino;
12314         ufs_lbn_t lbn;
12315
12316         ip = VTOI(vp);
12317         fs = ip->i_fs;
12318         ump = ip->i_ump;
12319         mp = vp->v_mount;
12320         if (MOUNTEDSOFTDEP(mp) == 0)
12321                 return (0);
12322         ACQUIRE_LOCK(ump);
12323 restart:
12324         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12325                 FREE_LOCK(ump);
12326                 return (0);
12327         }
12328         TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12329                 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12330                     == DEPCOMPLETE) {
12331                         jwait(&inoref->if_list, MNT_WAIT);
12332                         goto restart;   /* redo the lookup after waiting */
12333                 }
12334         }
12335         if (!LIST_EMPTY(&inodedep->id_inowait) ||
12336             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12337             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12338             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12339             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12340                 panic("softdep_fsync: pending ops %p", inodedep);
12341         for (error = 0, flushparent = 0; ; ) {
12342                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12343                         break;
12344                 if (wk->wk_type != D_DIRADD)
12345                         panic("softdep_fsync: Unexpected type %s",
12346                             TYPENAME(wk->wk_type));
12347                 dap = WK_DIRADD(wk);
12348                 /*
12349                  * Flush our parent if this directory entry has a MKDIR_PARENT
12350                  * dependency or is contained in a newly allocated block.
12351                  */
12352                 if (dap->da_state & DIRCHG)
12353                         pagedep = dap->da_previous->dm_pagedep;
12354                 else
12355                         pagedep = dap->da_pagedep;
12356                 parentino = pagedep->pd_ino;    /* used after the lock is dropped */
12357                 lbn = pagedep->pd_lbn;          /* likewise */
12358                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12359                         panic("softdep_fsync: dirty");
12360                 if ((dap->da_state & MKDIR_PARENT) ||
12361                     (pagedep->pd_state & NEWBLOCK))
12362                         flushparent = 1;
12363                 else
12364                         flushparent = 0;
12365                 /*
12366                  * If we are being fsync'ed as part of vgone'ing this vnode,
12367                  * then we will not be able to release and recover the
12368                  * vnode below, so we just have to give up on writing its
12369                  * directory entry out. It will eventually be written, just
12370                  * not now, but then the user was not asking to have it
12371                  * written, so we are not breaking any promises.
12372                  */
12373                 if (vp->v_iflag & VI_DOOMED)
12374                         break;
12375                 /*
12376                  * We prevent deadlock by always fetching inodes from the
12377                  * root, moving down the directory tree. Thus, when fetching
12378                  * our parent directory, we first try to get the lock. If
12379                  * that fails, we must unlock ourselves before requesting
12380                  * the lock on our parent. See the comment in ufs_lookup
12381                  * for details on possible races.
12382                  */
12383                 FREE_LOCK(ump);
12384                 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12385                     FFSV_FORCEINSMQ)) {
12386                         error = vfs_busy(mp, MBF_NOWAIT);
12387                         if (error != 0) {
12388                                 vfs_ref(mp);
12389                                 VOP_UNLOCK(vp, 0);
12390                                 error = vfs_busy(mp, 0);
12391                                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12392                                 vfs_rel(mp);
12393                                 if (error != 0)
12394                                         return (ENOENT);        /* could not busy mp */
12395                                 if (vp->v_iflag & VI_DOOMED) {
12396                                         vfs_unbusy(mp);
12397                                         return (ENOENT);
12398                                 }
12399                         }
12400                         VOP_UNLOCK(vp, 0);
12401                         error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12402                             &pvp, FFSV_FORCEINSMQ);
12403                         vfs_unbusy(mp);
12404                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12405                         if (vp->v_iflag & VI_DOOMED) {
12406                                 if (error == 0)
12407                                         vput(pvp);
12408                                 error = ENOENT;
12409                         }
12410                         if (error != 0)
12411                                 return (error);
12412                 }
12413                 /*
12414                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12415                  * that are contained in direct blocks will be resolved by
12416                  * doing a ffs_update. Pagedeps contained in indirect blocks
12417                  * may require a complete sync'ing of the directory. So, we
12418                  * try the cheap and fast ffs_update first, and if that fails,
12419                  * then we do the slower ffs_syncvnode of the directory.
12420                  */
12421                 if (flushparent) {
12422                         int locked;     /* nonzero while the lock taken below is held */
12423
12424                         if ((error = ffs_update(pvp, 1)) != 0) {
12425                                 vput(pvp);
12426                                 return (error);
12427                         }
12428                         ACQUIRE_LOCK(ump);
12429                         locked = 1;
12430                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12431                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12432                                         if (wk->wk_type != D_DIRADD)
12433                                                 panic("softdep_fsync: Unexpected type %s",
12434                                                       TYPENAME(wk->wk_type));
12435                                         dap = WK_DIRADD(wk);
12436                                         if (dap->da_state & DIRCHG)
12437                                                 pagedep = dap->da_previous->dm_pagedep;
12438                                         else
12439                                                 pagedep = dap->da_pagedep;
12440                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12441                                         FREE_LOCK(ump);
12442                                         locked = 0;
12443                                         if (pagedep_new_block && (error =
12444                                             ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12445                                                 vput(pvp);
12446                                                 return (error);
12447                                         }
12448                                 }
12449                         }
12450                         if (locked)
12451                                 FREE_LOCK(ump);
12452                 }
12453                 /*
12454                  * Flush directory page containing the inode's name.
12455                  */
12456                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12457                     &bp);
12458                 if (error == 0)
12459                         error = bwrite(bp);
12460                 else
12461                         brelse(bp);
12462                 vput(pvp);
12463                 if (error != 0)
12464                         return (error);
12465                 ACQUIRE_LOCK(ump);
12466                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12467                         break;          /* no inodedep left for this inode */
12468         }
12469         FREE_LOCK(ump);
12470         return (0);
12471 }
12472
12473 /*
12474  * Flush all the dirty bitmaps associated with the block device
12475  * before flushing the rest of the dirty blocks so as to reduce
12476  * the number of dependencies that will have to be rolled back.
12477  *
12478  * XXX Unused?
12479  */
12480 void
12481 softdep_fsync_mountdev(vp)
12482         struct vnode *vp;       /* must be a disk vnode, else we panic */
12483 {
12484         struct buf *bp, *nbp;
12485         struct worklist *wk;
12486         struct bufobj *bo;
12487
12488         if (!vn_isdisk(vp, NULL))
12489                 panic("softdep_fsync_mountdev: vnode not a disk");
12490         bo = &vp->v_bufobj;
12491 restart:
12492         BO_LOCK(bo);
12493         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12494                 /*
12495                  * If it is already scheduled (lock not free), skip it.
12496                  */
12497                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12498                         continue;
12499
12500                 if ((bp->b_flags & B_DELWRI) == 0)
12501                         panic("softdep_fsync_mountdev: not dirty");
12502                 /*
12503                  * We are only interested in bitmaps with outstanding
12504                  * dependencies that have no background write in progress.
12505                  */
12506                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12507                     wk->wk_type != D_BMSAFEMAP ||
12508                     (bp->b_vflags & BV_BKGRDINPROG)) {
12509                         BUF_UNLOCK(bp);
12510                         continue;
12511                 }
12512                 BO_UNLOCK(bo);
12513                 bremfree(bp);
12514                 (void) bawrite(bp);
12515                 goto restart;   /* bufobj lock was dropped; rescan the list */
12516         }
12517         drain_output(vp);
12518         BO_UNLOCK(bo);
12519 }
12520
12521 /*
12522  * Sync all cylinder groups that were dirty at the time this function is
12523  * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12524  * is used to flush freedep activity that may be holding up writes to an
12525  * indirect block.  Returns 0, or the error from a failed bwrite().
12526  */
12527 static int
12528 sync_cgs(mp, waitfor)
12529         struct mount *mp;
12530         int waitfor;            /* MNT_WAIT or MNT_NOWAIT, see below */
12531 {
12532         struct bmsafemap *bmsafemap;
12533         struct bmsafemap *sentinel;
12534         struct ufsmount *ump;
12535         struct buf *bp;
12536         int error;
12537
12538         sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12539         sentinel->sm_cg = -1;   /* -1 identifies a sentinel entry */
12540         ump = VFSTOUFS(mp);
12541         error = 0;
12542         ACQUIRE_LOCK(ump);
12543         LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12544         for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12545             bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12546                 /* Skip sentinels and cgs with no work to release. */
12547                 if (bmsafemap->sm_cg == -1 ||
12548                     (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12549                     LIST_EMPTY(&bmsafemap->sm_freewr))) {
12550                         LIST_REMOVE(sentinel, sm_next);
12551                         LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12552                         continue;
12553                 }
12554                 /*
12555                  * If we don't get the lock and we're waiting, try again; if
12556                  * not, move on to the next buf and try to sync it.
12557                  */
12558                 bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12559                 if (bp == NULL && waitfor == MNT_WAIT)
12560                         continue;       /* sentinel not moved: same cg again */
12561                 LIST_REMOVE(sentinel, sm_next);
12562                 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12563                 if (bp == NULL)
12564                         continue;
12565                 FREE_LOCK(ump);
12566                 if (waitfor == MNT_NOWAIT)
12567                         bawrite(bp);
12568                 else
12569                         error = bwrite(bp);
12570                 ACQUIRE_LOCK(ump);
12571                 if (error)
12572                         break;
12573         }
12574         LIST_REMOVE(sentinel, sm_next);
12575         FREE_LOCK(ump);
12576         free(sentinel, M_BMSAFEMAP);
12577         return (error);
12578 }
12579
12580 /*
12581  * Called when a file is being flushed synchronously.  This routine must
12582  * eliminate the filesystem metadata dependencies of the vnode so that
12583  * the syncing routine can succeed; it returns flush_inodedep_deps()'s result.
12584  */
12585 int
12586 softdep_sync_metadata(struct vnode *vp)
12587 {
12588         struct ufsmount *ump;
12589         struct inode *ip;
12590         int error;
12591
12592         ip = VTOI(vp);
12593         ump = ip->i_ump;
12594         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
12595             ("softdep_sync_metadata called on non-softdep filesystem"));
12596         ACQUIRE_LOCK(ump);
12597         /*
12598          * For a character device vnode, write all journal records
12599          * first to prevent rollbacks on devvp.
12600          */
12601         if (vp->v_type == VCHR)
12602                 softdep_flushjournal(vp->v_mount);
12603         /*
12604          * Ensure that any direct block dependencies have been cleared
12605          * and inode references are journaled, then that all truncates
12606          * are written so we won't find deps on indirect blocks.
12607          */
12608         error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12609         process_truncates(vp);
12610         FREE_LOCK(ump);
12611
12612         return (error);
12613 }
12614
12615 /*
12616  * This routine is called when we are attempting to sync a buf with
12617  * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12618  * other IO it can but returns EBUSY if the buffer is not yet able to
12619  * be written.  Dependencies which will not cause rollbacks will always
12620  * return 0.
12621  */
12622 int
12623 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12624 {
12625         struct indirdep *indirdep;
12626         struct pagedep *pagedep;
12627         struct allocindir *aip;
12628         struct newblk *newblk;
12629         struct ufsmount *ump;
12630         struct buf *nbp;
12631         struct worklist *wk;
12632         int i, error;
12633
12634         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12635             ("softdep_sync_buf called on non-softdep filesystem"));
12636         /*
12637          * For VCHR we just don't want to force flush any dependencies that
12638          * will cause rollbacks.
12639          */
12640         if (vp->v_type == VCHR) {
12641                 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12642                         return (EBUSY);
12643                 return (0);
12644         }
12645         ump = VTOI(vp)->i_ump;
12646         ACQUIRE_LOCK(ump);
12647         /*
12648          * As we hold the buffer locked, none of its dependencies
12649          * will disappear.
12650          */
12651         error = 0;
12652 top:
12653         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12654                 switch (wk->wk_type) {
12655
12656                 case D_ALLOCDIRECT:
12657                 case D_ALLOCINDIR:
12658                         newblk = WK_NEWBLK(wk);
12659                         if (newblk->nb_jnewblk != NULL) {
12660                                 if (waitfor == MNT_NOWAIT) {
12661                                         error = EBUSY;
12662                                         goto out_unlock;
12663                                 }
12664                                 jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12665                                 goto top;
12666                         }
12667                         if (newblk->nb_state & DEPCOMPLETE ||
12668                             waitfor == MNT_NOWAIT)
12669                                 continue;
12670                         nbp = newblk->nb_bmsafemap->sm_buf;
12671                         nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12672                         if (nbp == NULL)
12673                                 goto top;
12674                         FREE_LOCK(ump);
12675                         if ((error = bwrite(nbp)) != 0)
12676                                 goto out;
12677                         ACQUIRE_LOCK(ump);
12678                         continue;
12679
12680                 case D_INDIRDEP:
12681                         indirdep = WK_INDIRDEP(wk);
12682                         if (waitfor == MNT_NOWAIT) {
12683                                 if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12684                                     !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12685                                         error = EBUSY;
12686                                         goto out_unlock;
12687                                 }
12688                         }
12689                         if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12690                                 panic("softdep_sync_buf: truncation pending.");
12691                 restart:
12692                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12693                                 newblk = (struct newblk *)aip;
12694                                 if (newblk->nb_jnewblk != NULL) {
12695                                         jwait(&newblk->nb_jnewblk->jn_list,
12696                                             waitfor);
12697                                         goto restart;
12698                                 }
12699                                 if (newblk->nb_state & DEPCOMPLETE)
12700                                         continue;
12701                                 nbp = newblk->nb_bmsafemap->sm_buf;
12702                                 nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12703                                 if (nbp == NULL)
12704                                         goto restart;
12705                                 FREE_LOCK(ump);
12706                                 if ((error = bwrite(nbp)) != 0)
12707                                         goto out;
12708                                 ACQUIRE_LOCK(ump);
12709                                 goto restart;
12710                         }
12711                         continue;
12712
12713                 case D_PAGEDEP:
12714                         /*
12715                          * Only flush directory entries in synchronous passes.
12716                          */
12717                         if (waitfor != MNT_WAIT) {
12718                                 error = EBUSY;
12719                                 goto out_unlock;
12720                         }
12721                         /*
12722                          * While syncing snapshots, we must allow recursive
12723                          * lookups.
12724                          */
12725                         BUF_AREC(bp);
12726                         /*
12727                          * We are trying to sync a directory that may
12728                          * have dependencies on both its own metadata
12729                          * and/or dependencies on the inodes of any
12730                          * recently allocated files. We walk its diradd
12731                          * lists pushing out the associated inode.
12732                          */
12733                         pagedep = WK_PAGEDEP(wk);
12734                         for (i = 0; i < DAHASHSZ; i++) {
12735                                 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12736                                         continue;
12737                                 if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12738                                     &pagedep->pd_diraddhd[i]))) {
12739                                         BUF_NOREC(bp);
12740                                         goto out_unlock;
12741                                 }
12742                         }
12743                         BUF_NOREC(bp);
12744                         continue;
12745
12746                 case D_FREEWORK:
12747                 case D_FREEDEP:
12748                 case D_JSEGDEP:
12749                 case D_JNEWBLK:
12750                         continue;
12751
12752                 default:
12753                         panic("softdep_sync_buf: Unknown type %s",
12754                             TYPENAME(wk->wk_type));
12755                         /* NOTREACHED */
12756                 }
12757         }
12758 out_unlock:
12759         FREE_LOCK(ump);
12760 out:
12761         return (error);
12762 }
12763
12764 /*
12765  * Flush the dependencies associated with an inodedep.
12766  * Called with splbio blocked.
12767  */
12768 static int
12769 flush_inodedep_deps(vp, mp, ino)
12770         struct vnode *vp;
12771         struct mount *mp;
12772         ino_t ino;
12773 {
12774         struct inodedep *inodedep;
12775         struct inoref *inoref;
12776         struct ufsmount *ump;
12777         int error, waitfor;
12778
12779         /*
12780          * This work is done in two passes. The first pass grabs most
12781          * of the buffers and begins asynchronously writing them. The
12782          * only way to wait for these asynchronous writes is to sleep
12783          * on the filesystem vnode which may stay busy for a long time
12784          * if the filesystem is active. So, instead, we make a second
12785          * pass over the dependencies blocking on each write. In the
12786          * usual case we will be blocking against a write that we
12787          * initiated, so when it is done the dependency will have been
12788          * resolved. Thus the second pass is expected to end quickly.
12789          * We give a brief window at the top of the loop to allow
12790          * any pending I/O to complete.
12791          */
12792         ump = VFSTOUFS(mp);
12793         LOCK_OWNED(ump);
12794         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12795                 if (error)
12796                         return (error);
12797                 FREE_LOCK(ump);
12798                 ACQUIRE_LOCK(ump);
12799 restart:
12800                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12801                         return (0);
12802                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12803                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12804                             == DEPCOMPLETE) {
12805                                 jwait(&inoref->if_list, MNT_WAIT);
12806                                 goto restart;
12807                         }
12808                 }
12809                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12810                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12811                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12812                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12813                         continue;
12814                 /*
12815                  * If pass2, we are done, otherwise do pass 2.
12816                  */
12817                 if (waitfor == MNT_WAIT)
12818                         break;
12819                 waitfor = MNT_WAIT;
12820         }
12821         /*
12822          * Try freeing inodedep in case all dependencies have been removed.
12823          */
12824         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12825                 (void) free_inodedep(inodedep);
12826         return (0);
12827 }
12828
12829 /*
12830  * Flush an inode dependency list.
12831  * Called with splbio blocked.
12832  */
12833 static int
12834 flush_deplist(listhead, waitfor, errorp)
12835         struct allocdirectlst *listhead;
12836         int waitfor;
12837         int *errorp;
12838 {
12839         struct allocdirect *adp;
12840         struct newblk *newblk;
12841         struct ufsmount *ump;
12842         struct buf *bp;
12843
12844         if ((adp = TAILQ_FIRST(listhead)) == NULL)
12845                 return (0);
12846         ump = VFSTOUFS(adp->ad_list.wk_mp);
12847         LOCK_OWNED(ump);
12848         TAILQ_FOREACH(adp, listhead, ad_next) {
12849                 newblk = (struct newblk *)adp;
12850                 if (newblk->nb_jnewblk != NULL) {
12851                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12852                         return (1);
12853                 }
12854                 if (newblk->nb_state & DEPCOMPLETE)
12855                         continue;
12856                 bp = newblk->nb_bmsafemap->sm_buf;
12857                 bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12858                 if (bp == NULL) {
12859                         if (waitfor == MNT_NOWAIT)
12860                                 continue;
12861                         return (1);
12862                 }
12863                 FREE_LOCK(ump);
12864                 if (waitfor == MNT_NOWAIT)
12865                         bawrite(bp);
12866                 else
12867                         *errorp = bwrite(bp);
12868                 ACQUIRE_LOCK(ump);
12869                 return (1);
12870         }
12871         return (0);
12872 }
12873
12874 /*
12875  * Flush dependencies associated with an allocdirect block.
12876  */
12877 static int
12878 flush_newblk_dep(vp, mp, lbn)
12879         struct vnode *vp;
12880         struct mount *mp;
12881         ufs_lbn_t lbn;
12882 {
12883         struct newblk *newblk;
12884         struct ufsmount *ump;
12885         struct bufobj *bo;
12886         struct inode *ip;
12887         struct buf *bp;
12888         ufs2_daddr_t blkno;
12889         int error;
12890
12891         error = 0;
12892         bo = &vp->v_bufobj;
12893         ip = VTOI(vp);
12894         blkno = DIP(ip, i_db[lbn]);
12895         if (blkno == 0)
12896                 panic("flush_newblk_dep: Missing block");
12897         ump = VFSTOUFS(mp);
12898         ACQUIRE_LOCK(ump);
12899         /*
12900          * Loop until all dependencies related to this block are satisfied.
12901          * We must be careful to restart after each sleep in case a write
12902          * completes some part of this process for us.
12903          */
12904         for (;;) {
12905                 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12906                         FREE_LOCK(ump);
12907                         break;
12908                 }
12909                 if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12910                         panic("flush_newblk_deps: Bad newblk %p", newblk);
12911                 /*
12912                  * Flush the journal.
12913                  */
12914                 if (newblk->nb_jnewblk != NULL) {
12915                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12916                         continue;
12917                 }
12918                 /*
12919                  * Write the bitmap dependency.
12920                  */
12921                 if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12922                         bp = newblk->nb_bmsafemap->sm_buf;
12923                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12924                         if (bp == NULL)
12925                                 continue;
12926                         FREE_LOCK(ump);
12927                         error = bwrite(bp);
12928                         if (error)
12929                                 break;
12930                         ACQUIRE_LOCK(ump);
12931                         continue;
12932                 }
12933                 /*
12934                  * Write the buffer.
12935                  */
12936                 FREE_LOCK(ump);
12937                 BO_LOCK(bo);
12938                 bp = gbincore(bo, lbn);
12939                 if (bp != NULL) {
12940                         error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12941                             LK_INTERLOCK, BO_LOCKPTR(bo));
12942                         if (error == ENOLCK) {
12943                                 ACQUIRE_LOCK(ump);
12944                                 error = 0;
12945                                 continue; /* Slept, retry */
12946                         }
12947                         if (error != 0)
12948                                 break;  /* Failed */
12949                         if (bp->b_flags & B_DELWRI) {
12950                                 bremfree(bp);
12951                                 error = bwrite(bp);
12952                                 if (error)
12953                                         break;
12954                         } else
12955                                 BUF_UNLOCK(bp);
12956                 } else
12957                         BO_UNLOCK(bo);
12958                 /*
12959                  * We have to wait for the direct pointers to
12960                  * point at the newdirblk before the dependency
12961                  * will go away.
12962                  */
12963                 error = ffs_update(vp, 1);
12964                 if (error)
12965                         break;
12966                 ACQUIRE_LOCK(ump);
12967         }
12968         return (error);
12969 }
12970
/*
 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 *
 * pvp is the directory vnode being synced, mp its mount, and diraddhdp
 * the list of diradd entries to work through.
 *
 * The caller must hold the softdep lock for the mount (checked with
 * LOCK_OWNED() below).  The lock is released around every operation
 * that does I/O; every error exit from the loop happens with the lock
 * released, so it is retaken after the loop and the function always
 * returns with the lock held.
 *
 * Returns 0 on success or the first error from ffs_update(),
 * ffs_vgetf(), flush_newblk_dep() or bwrite().  Panics if an entry is
 * still at the head of the list after every flushing step was tried.
 */
static int
flush_pagedep_deps(pvp, mp, diraddhdp)
        struct vnode *pvp;
        struct mount *mp;
        struct diraddhd *diraddhdp;
{
        struct inodedep *inodedep;
        struct inoref *inoref;
        struct ufsmount *ump;
        struct diradd *dap;
        struct vnode *vp;
        int error = 0;
        struct buf *bp;
        ino_t inum;
        /* Entries set aside during the walk; put back before returning. */
        struct diraddhd unfinished;

        LIST_INIT(&unfinished);
        ump = VFSTOUFS(mp);
        LOCK_OWNED(ump);
restart:
        /*
         * Always work on the current head of the list.  After each
         * flushing step, "dap != LIST_FIRST(diraddhdp)" is used as the
         * test for whether that step got rid of the entry.
         */
        while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
                /*
                 * Flush ourselves if this directory entry
                 * has a MKDIR_PARENT dependency.
                 */
                if (dap->da_state & MKDIR_PARENT) {
                        FREE_LOCK(ump);
                        if ((error = ffs_update(pvp, 1)) != 0)
                                break;
                        ACQUIRE_LOCK(ump);
                        /*
                         * If that cleared dependencies, go on to next.
                         */
                        if (dap != LIST_FIRST(diraddhdp))
                                continue;
                        /*
                         * All MKDIR_PARENT dependencies and all the
                         * NEWBLOCK pagedeps that are contained in direct
                         * blocks were resolved by doing above ffs_update.
                         * Pagedeps contained in indirect blocks may
                         * require a complete sync'ing of the directory.
                         * We are in the midst of doing a complete sync,
                         * so if they are not resolved in this pass we
                         * defer them for now as they will be sync'ed by
                         * our caller shortly.
                         */
                        LIST_REMOVE(dap, da_pdlist);
                        LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
                        continue;
                }
                /*
                 * A newly allocated directory must have its "." and
                 * ".." entries written out before its name can be
                 * committed in its parent.
                 */
                inum = dap->da_newinum;
                if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
                        panic("flush_pagedep_deps: lost inode1");
                /*
                 * Wait for any pending journal adds to complete so we don't
                 * cause rollbacks while syncing.  After waiting, start
                 * over from the head of the list.
                 */
                TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
                        if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
                            == DEPCOMPLETE) {
                                jwait(&inoref->if_list, MNT_WAIT);
                                goto restart;
                        }
                }
                if (dap->da_state & MKDIR_BODY) {
                        FREE_LOCK(ump);
                        if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
                            FFSV_FORCEINSMQ)))
                                break;
                        /* Flush block 0 of the new directory. */
                        error = flush_newblk_dep(vp, mp, 0);
                        /*
                         * If we still have the dependency we might need to
                         * update the vnode to sync the new link count to
                         * disk.
                         */
                        if (error == 0 && dap == LIST_FIRST(diraddhdp))
                                error = ffs_update(vp, 1);
                        vput(vp);
                        if (error != 0)
                                break;
                        ACQUIRE_LOCK(ump);
                        /*
                         * If that cleared dependencies, go on to next.
                         */
                        if (dap != LIST_FIRST(diraddhdp))
                                continue;
                        /*
                         * The entry is still first in the list; if
                         * MKDIR_BODY is still set the flush above did
                         * not work.  The lookup only refreshes inodedep
                         * for the panic message.
                         */
                        if (dap->da_state & MKDIR_BODY) {
                                inodedep_lookup(UFSTOVFS(ump), inum, 0,
                                    &inodedep);
                                panic("flush_pagedep_deps: MKDIR_BODY "
                                    "inodedep %p dap %p vp %p",
                                    inodedep, dap, vp);
                        }
                }
                /*
                 * Flush the inode on which the directory entry depends.
                 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
                 * the only remaining dependency is that the updated inode
                 * count must get pushed to disk. The inode has already
                 * been pushed into its inode buffer (via VOP_UPDATE) at
                 * the time of the reference count change. So we need only
                 * locate that buffer, ensure that there will be no rollback
                 * caused by a bitmap dependency, then write the inode buffer.
                 */
retry:
                if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
                        panic("flush_pagedep_deps: lost inode");
                /*
                 * If the inode still has bitmap dependencies,
                 * push them to disk.  A NULL return from getdirtybuf()
                 * sends us back to look the inodedep up again.
                 */
                if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
                        bp = inodedep->id_bmsafemap->sm_buf;
                        bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
                        if (bp == NULL)
                                goto retry;
                        FREE_LOCK(ump);
                        if ((error = bwrite(bp)) != 0)
                                break;
                        ACQUIRE_LOCK(ump);
                        if (dap != LIST_FIRST(diraddhdp))
                                continue;
                }
                /*
                 * If the inode is still sitting in a buffer waiting
                 * to be written or waiting for the link count to be
                 * adjusted update it here to flush it to disk.
                 */
                if (dap == LIST_FIRST(diraddhdp)) {
                        FREE_LOCK(ump);
                        if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
                            FFSV_FORCEINSMQ)))
                                break;
                        error = ffs_update(vp, 1);
                        vput(vp);
                        if (error)
                                break;
                        ACQUIRE_LOCK(ump);
                }
                /*
                 * If we have failed to get rid of all the dependencies
                 * then something is seriously wrong.
                 */
                if (dap == LIST_FIRST(diraddhdp)) {
                        inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
                        panic("flush_pagedep_deps: failed to flush "
                            "inodedep %p ino %ju dap %p",
                            inodedep, (uintmax_t)inum, dap);
                }
        }
        /*
         * Every "break" above is taken with the softdep lock released
         * and error set, so retake the lock before touching the lists.
         */
        if (error)
                ACQUIRE_LOCK(ump);
        /*
         * Move the deferred MKDIR_PARENT entries back onto the
         * caller's list.
         */
        while ((dap = LIST_FIRST(&unfinished)) != NULL) {
                LIST_REMOVE(dap, da_pdlist);
                LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
        }
        return (error);
}
13138
13139 /*
13140  * A large burst of file addition or deletion activity can drive the
13141  * memory load excessively high. First attempt to slow things down
13142  * using the techniques below. If that fails, this routine requests
13143  * the offending operations to fall back to running synchronously
13144  * until the memory load returns to a reasonable level.
13145  */
13146 int
13147 softdep_slowdown(vp)
13148         struct vnode *vp;
13149 {
13150         struct ufsmount *ump;
13151         int jlow;
13152         int max_softdeps_hard;
13153
13154         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13155             ("softdep_slowdown called on non-softdep filesystem"));
13156         ump = VFSTOUFS(vp->v_mount);
13157         ACQUIRE_LOCK(ump);
13158         jlow = 0;
13159         /*
13160          * Check for journal space if needed.
13161          */
13162         if (DOINGSUJ(vp)) {
13163                 if (journal_space(ump, 0) == 0)
13164                         jlow = 1;
13165         }
13166         /*
13167          * If the system is under its limits and our filesystem is
13168          * not responsible for more than our share of the usage and
13169          * we are not low on journal space, then no need to slow down.
13170          */
13171         max_softdeps_hard = max_softdeps * 11 / 10;
13172         if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13173             dep_current[D_INODEDEP] < max_softdeps_hard &&
13174             dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13175             dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13176             ump->softdep_curdeps[D_DIRREM] <
13177             (max_softdeps_hard / 2) / stat_flush_threads &&
13178             ump->softdep_curdeps[D_INODEDEP] <
13179             max_softdeps_hard / stat_flush_threads &&
13180             ump->softdep_curdeps[D_INDIRDEP] <
13181             (max_softdeps_hard / 1000) / stat_flush_threads &&
13182             ump->softdep_curdeps[D_FREEBLKS] <
13183             max_softdeps_hard / stat_flush_threads) {
13184                 FREE_LOCK(ump);
13185                 return (0);
13186         }
13187         /*
13188          * If the journal is low or our filesystem is over its limit
13189          * then speedup the cleanup.
13190          */
13191         if (ump->softdep_curdeps[D_INDIRDEP] <
13192             (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13193                 softdep_speedup(ump);
13194         stat_sync_limit_hit += 1;
13195         FREE_LOCK(ump);
13196         /*
13197          * We only slow down the rate at which new dependencies are
13198          * generated if we are not using journaling. With journaling,
13199          * the cleanup should always be sufficient to keep things
13200          * under control.
13201          */
13202         if (DOINGSUJ(vp))
13203                 return (0);
13204         return (1);
13205 }
13206
13207 /*
13208  * Called by the allocation routines when they are about to fail
13209  * in the hope that we can free up the requested resource (inodes
13210  * or disk space).
13211  *
13212  * First check to see if the work list has anything on it. If it has,
13213  * clean up entries until we successfully free the requested resource.
13214  * Because this process holds inodes locked, we cannot handle any remove
13215  * requests that might block on a locked inode as that could lead to
13216  * deadlock. If the worklist yields none of the requested resource,
13217  * start syncing out vnodes to free up the needed space.
13218  */
13219 int
13220 softdep_request_cleanup(fs, vp, cred, resource)
13221         struct fs *fs;
13222         struct vnode *vp;
13223         struct ucred *cred;
13224         int resource;
13225 {
13226         struct ufsmount *ump;
13227         struct mount *mp;
13228         struct vnode *lvp, *mvp;
13229         long starttime;
13230         ufs2_daddr_t needed;
13231         int error;
13232
13233         /*
13234          * If we are being called because of a process doing a
13235          * copy-on-write, then it is not safe to process any
13236          * worklist items as we will recurse into the copyonwrite
13237          * routine.  This will result in an incoherent snapshot.
13238          * If the vnode that we hold is a snapshot, we must avoid
13239          * handling other resources that could cause deadlock.
13240          */
13241         if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13242                 return (0);
13243
13244         if (resource == FLUSH_BLOCKS_WAIT)
13245                 stat_cleanup_blkrequests += 1;
13246         else
13247                 stat_cleanup_inorequests += 1;
13248
13249         mp = vp->v_mount;
13250         ump = VFSTOUFS(mp);
13251         mtx_assert(UFS_MTX(ump), MA_OWNED);
13252         UFS_UNLOCK(ump);
13253         error = ffs_update(vp, 1);
13254         if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13255                 UFS_LOCK(ump);
13256                 return (0);
13257         }
13258         /*
13259          * If we are in need of resources, start by cleaning up
13260          * any block removals associated with our inode.
13261          */
13262         ACQUIRE_LOCK(ump);
13263         process_removes(vp);
13264         process_truncates(vp);
13265         FREE_LOCK(ump);
13266         /*
13267          * Now clean up at least as many resources as we will need.
13268          *
13269          * When requested to clean up inodes, the number that are needed
13270          * is set by the number of simultaneous writers (mnt_writeopcount)
13271          * plus a bit of slop (2) in case some more writers show up while
13272          * we are cleaning.
13273          *
13274          * When requested to free up space, the amount of space that
13275          * we need is enough blocks to allocate a full-sized segment
13276          * (fs_contigsumsize). The number of such segments that will
13277          * be needed is set by the number of simultaneous writers
13278          * (mnt_writeopcount) plus a bit of slop (2) in case some more
13279          * writers show up while we are cleaning.
13280          *
13281          * Additionally, if we are unpriviledged and allocating space,
13282          * we need to ensure that we clean up enough blocks to get the
13283          * needed number of blocks over the threshhold of the minimum
13284          * number of blocks required to be kept free by the filesystem
13285          * (fs_minfree).
13286          */
13287         if (resource == FLUSH_INODES_WAIT) {
13288                 needed = vp->v_mount->mnt_writeopcount + 2;
13289         } else if (resource == FLUSH_BLOCKS_WAIT) {
13290                 needed = (vp->v_mount->mnt_writeopcount + 2) *
13291                     fs->fs_contigsumsize;
13292                 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
13293                         needed += fragstoblks(fs,
13294                             roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13295                             fs->fs_cstotal.cs_nffree, fs->fs_frag));
13296         } else {
13297                 UFS_LOCK(ump);
13298                 printf("softdep_request_cleanup: Unknown resource type %d\n",
13299                     resource);
13300                 return (0);
13301         }
13302         starttime = time_second;
13303 retry:
13304         if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13305             fs->fs_cstotal.cs_nbfree <= needed) ||
13306             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13307             fs->fs_cstotal.cs_nifree <= needed)) {
13308                 ACQUIRE_LOCK(ump);
13309                 if (ump->softdep_on_worklist > 0 &&
13310                     process_worklist_item(UFSTOVFS(ump),
13311                     ump->softdep_on_worklist, LK_NOWAIT) != 0)
13312                         stat_worklist_push += 1;
13313                 FREE_LOCK(ump);
13314         }
13315         /*
13316          * If we still need resources and there are no more worklist
13317          * entries to process to obtain them, we have to start flushing
13318          * the dirty vnodes to force the release of additional requests
13319          * to the worklist that we can then process to reap addition
13320          * resources. We walk the vnodes associated with the mount point
13321          * until we get the needed worklist requests that we can reap.
13322          */
13323         if ((resource == FLUSH_BLOCKS_WAIT &&
13324              fs->fs_cstotal.cs_nbfree <= needed) ||
13325             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13326              fs->fs_cstotal.cs_nifree <= needed)) {
13327                 MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13328                         if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13329                                 VI_UNLOCK(lvp);
13330                                 continue;
13331                         }
13332                         if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13333                             curthread))
13334                                 continue;
13335                         if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
13336                                 vput(lvp);
13337                                 continue;
13338                         }
13339                         (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13340                         vput(lvp);
13341                 }
13342                 lvp = ump->um_devvp;
13343                 if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13344                         VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
13345                         VOP_UNLOCK(lvp, 0);
13346                 }
13347                 if (ump->softdep_on_worklist > 0) {
13348                         stat_cleanup_retries += 1;
13349                         goto retry;
13350                 }
13351                 stat_cleanup_failures += 1;
13352         }
13353         if (time_second - starttime > stat_cleanup_high_delay)
13354                 stat_cleanup_high_delay = time_second - starttime;
13355         UFS_LOCK(ump);
13356         return (1);
13357 }
13358
13359 static bool
13360 softdep_excess_items(struct ufsmount *ump, int item)
13361 {
13362
13363         KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
13364         return (dep_current[item] > max_softdeps &&
13365             ump->softdep_curdeps[item] > max_softdeps /
13366             stat_flush_threads);
13367 }
13368
13369 static void
13370 schedule_cleanup(struct mount *mp)
13371 {
13372         struct ufsmount *ump;
13373         struct thread *td;
13374
13375         ump = VFSTOUFS(mp);
13376         LOCK_OWNED(ump);
13377         FREE_LOCK(ump);
13378         td = curthread;
13379         if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13380             (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13381                 /*
13382                  * No ast is delivered to kernel threads, so nobody
13383                  * would deref the mp.  Some kernel threads
13384                  * explicitely check for AST, e.g. NFS daemon does
13385                  * this in the serving loop.
13386                  */
13387                 return;
13388         }
13389         if (td->td_su != NULL)
13390                 vfs_rel(td->td_su);
13391         vfs_ref(mp);
13392         td->td_su = mp;
13393         thread_lock(td);
13394         td->td_flags |= TDF_ASTPENDING;
13395         thread_unlock(td);
13396 }
13397
13398 static void
13399 softdep_ast_cleanup_proc(void)
13400 {
13401         struct thread *td;
13402         struct mount *mp;
13403         struct ufsmount *ump;
13404         int error;
13405         bool req;
13406
13407         td = curthread;
13408         while ((mp = td->td_su) != NULL) {
13409                 td->td_su = NULL;
13410                 error = vfs_busy(mp, MBF_NOWAIT);
13411                 vfs_rel(mp);
13412                 if (error != 0)
13413                         return;
13414                 if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13415                         ump = VFSTOUFS(mp);
13416                         for (;;) {
13417                                 req = false;
13418                                 ACQUIRE_LOCK(ump);
13419                                 if (softdep_excess_items(ump, D_INODEDEP)) {
13420                                         req = true;
13421                                         request_cleanup(mp, FLUSH_INODES);
13422                                 }
13423                                 if (softdep_excess_items(ump, D_DIRREM)) {
13424                                         req = true;
13425                                         request_cleanup(mp, FLUSH_BLOCKS);
13426                                 }
13427                                 FREE_LOCK(ump);
13428                                 if (softdep_excess_items(ump, D_NEWBLK) ||
13429                                     softdep_excess_items(ump, D_ALLOCDIRECT) ||
13430                                     softdep_excess_items(ump, D_ALLOCINDIR)) {
13431                                         error = vn_start_write(NULL, &mp,
13432                                             V_WAIT);
13433                                         if (error == 0) {
13434                                                 req = true;
13435                                                 VFS_SYNC(mp, MNT_WAIT);
13436                                                 vn_finished_write(mp);
13437                                         }
13438                                 }
13439                                 if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13440                                         break;
13441                         }
13442                 }
13443                 vfs_unbusy(mp);
13444         }
13445 }
13446
13447 /*
13448  * If memory utilization has gotten too high, deliberately slow things
13449  * down and speed up the I/O processing.
13450  */
13451 static int
13452 request_cleanup(mp, resource)
13453         struct mount *mp;
13454         int resource;
13455 {
13456         struct thread *td = curthread;
13457         struct ufsmount *ump;
13458
13459         ump = VFSTOUFS(mp);
13460         LOCK_OWNED(ump);
13461         /*
13462          * We never hold up the filesystem syncer or buf daemon.
13463          */
13464         if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13465                 return (0);
13466         /*
13467          * First check to see if the work list has gotten backlogged.
13468          * If it has, co-opt this process to help clean up two entries.
13469          * Because this process may hold inodes locked, we cannot
13470          * handle any remove requests that might block on a locked
13471          * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13472          * to avoid recursively processing the worklist.
13473          */
13474         if (ump->softdep_on_worklist > max_softdeps / 10) {
13475                 td->td_pflags |= TDP_SOFTDEP;
13476                 process_worklist_item(mp, 2, LK_NOWAIT);
13477                 td->td_pflags &= ~TDP_SOFTDEP;
13478                 stat_worklist_push += 2;
13479                 return(1);
13480         }
13481         /*
13482          * Next, we attempt to speed up the syncer process. If that
13483          * is successful, then we allow the process to continue.
13484          */
13485         if (softdep_speedup(ump) &&
13486             resource != FLUSH_BLOCKS_WAIT &&
13487             resource != FLUSH_INODES_WAIT)
13488                 return(0);
13489         /*
13490          * If we are resource constrained on inode dependencies, try
13491          * flushing some dirty inodes. Otherwise, we are constrained
13492          * by file deletions, so try accelerating flushes of directories
13493          * with removal dependencies. We would like to do the cleanup
13494          * here, but we probably hold an inode locked at this point and
13495          * that might deadlock against one that we try to clean. So,
13496          * the best that we can do is request the syncer daemon to do
13497          * the cleanup for us.
13498          */
13499         switch (resource) {
13500
13501         case FLUSH_INODES:
13502         case FLUSH_INODES_WAIT:
13503                 ACQUIRE_GBLLOCK(&lk);
13504                 stat_ino_limit_push += 1;
13505                 req_clear_inodedeps += 1;
13506                 FREE_GBLLOCK(&lk);
13507                 stat_countp = &stat_ino_limit_hit;
13508                 break;
13509
13510         case FLUSH_BLOCKS:
13511         case FLUSH_BLOCKS_WAIT:
13512                 ACQUIRE_GBLLOCK(&lk);
13513                 stat_blk_limit_push += 1;
13514                 req_clear_remove += 1;
13515                 FREE_GBLLOCK(&lk);
13516                 stat_countp = &stat_blk_limit_hit;
13517                 break;
13518
13519         default:
13520                 panic("request_cleanup: unknown type");
13521         }
13522         /*
13523          * Hopefully the syncer daemon will catch up and awaken us.
13524          * We wait at most tickdelay before proceeding in any case.
13525          */
13526         ACQUIRE_GBLLOCK(&lk);
13527         FREE_LOCK(ump);
13528         proc_waiting += 1;
13529         if (callout_pending(&softdep_callout) == FALSE)
13530                 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13531                     pause_timer, 0);
13532
13533         if ((td->td_pflags & TDP_KTHREAD) == 0)
13534                 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13535         proc_waiting -= 1;
13536         FREE_GBLLOCK(&lk);
13537         ACQUIRE_LOCK(ump);
13538         return (1);
13539 }
13540
13541 /*
13542  * Awaken processes pausing in request_cleanup and clear proc_waiting
13543  * to indicate that there is no longer a timer running. Pause_timer
13544  * will be called with the global softdep mutex (&lk) locked.
13545  */
13546 static void
13547 pause_timer(arg)
13548         void *arg;
13549 {
13550
13551         GBLLOCK_OWNED(&lk);
13552         /*
13553          * The callout_ API has acquired mtx and will hold it around this
13554          * function call.
13555          */
13556         *stat_countp += proc_waiting;
13557         wakeup(&proc_waiting);
13558 }
13559
13560 /*
13561  * If requested, try removing inode or removal dependencies.
13562  */
13563 static void
13564 check_clear_deps(mp)
13565         struct mount *mp;
13566 {
13567
13568         /*
13569          * If we are suspended, it may be because of our using
13570          * too many inodedeps, so help clear them out.
13571          */
13572         if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13573                 clear_inodedeps(mp);
13574         /*
13575          * General requests for cleanup of backed up dependencies
13576          */
13577         ACQUIRE_GBLLOCK(&lk);
13578         if (req_clear_inodedeps) {
13579                 req_clear_inodedeps -= 1;
13580                 FREE_GBLLOCK(&lk);
13581                 clear_inodedeps(mp);
13582                 ACQUIRE_GBLLOCK(&lk);
13583                 wakeup(&proc_waiting);
13584         }
13585         if (req_clear_remove) {
13586                 req_clear_remove -= 1;
13587                 FREE_GBLLOCK(&lk);
13588                 clear_remove(mp);
13589                 ACQUIRE_GBLLOCK(&lk);
13590                 wakeup(&proc_waiting);
13591         }
13592         FREE_GBLLOCK(&lk);
13593 }
13594
13595 /*
13596  * Flush out a directory with at least one removal dependency in an effort to
13597  * reduce the number of dirrem, freefile, and freeblks dependency structures.
13598  */
13599 static void
13600 clear_remove(mp)
13601         struct mount *mp;
13602 {
13603         struct pagedep_hashhead *pagedephd;
13604         struct pagedep *pagedep;
13605         struct ufsmount *ump;
13606         struct vnode *vp;
13607         struct bufobj *bo;
13608         int error, cnt;
13609         ino_t ino;
13610
13611         ump = VFSTOUFS(mp);
13612         LOCK_OWNED(ump);
13613
13614         for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13615                 pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13616                 if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13617                         ump->pagedep_nextclean = 0;
13618                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13619                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
13620                                 continue;
13621                         ino = pagedep->pd_ino;
13622                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13623                                 continue;
13624                         FREE_LOCK(ump);
13625
13626                         /*
13627                          * Let unmount clear deps
13628                          */
13629                         error = vfs_busy(mp, MBF_NOWAIT);
13630                         if (error != 0)
13631                                 goto finish_write;
13632                         error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13633                              FFSV_FORCEINSMQ);
13634                         vfs_unbusy(mp);
13635                         if (error != 0) {
13636                                 softdep_error("clear_remove: vget", error);
13637                                 goto finish_write;
13638                         }
13639                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13640                                 softdep_error("clear_remove: fsync", error);
13641                         bo = &vp->v_bufobj;
13642                         BO_LOCK(bo);
13643                         drain_output(vp);
13644                         BO_UNLOCK(bo);
13645                         vput(vp);
13646                 finish_write:
13647                         vn_finished_write(mp);
13648                         ACQUIRE_LOCK(ump);
13649                         return;
13650                 }
13651         }
13652 }
13653
13654 /*
13655  * Clear out a block of dirty inodes in an effort to reduce
13656  * the number of inodedep dependency structures.
13657  */
13658 static void
13659 clear_inodedeps(mp)
13660         struct mount *mp;
13661 {
13662         struct inodedep_hashhead *inodedephd;
13663         struct inodedep *inodedep;
13664         struct ufsmount *ump;
13665         struct vnode *vp;
13666         struct fs *fs;
13667         int error, cnt;
13668         ino_t firstino, lastino, ino;
13669
13670         ump = VFSTOUFS(mp);
13671         fs = ump->um_fs;
13672         LOCK_OWNED(ump);
13673         /*
13674          * Pick a random inode dependency to be cleared.
13675          * We will then gather up all the inodes in its block
13676          * that have dependencies and flush them out.
13677          */
13678         for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
13679                 inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
13680                 if (ump->inodedep_nextclean > ump->inodedep_hash_size)
13681                         ump->inodedep_nextclean = 0;
13682                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13683                         break;
13684         }
13685         if (inodedep == NULL)
13686                 return;
13687         /*
13688          * Find the last inode in the block with dependencies.
13689          */
13690         firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13691         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13692                 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13693                         break;
13694         /*
13695          * Asynchronously push all but the last inode with dependencies.
13696          * Synchronously push the last inode with dependencies to ensure
13697          * that the inode block gets written to free up the inodedeps.
13698          */
13699         for (ino = firstino; ino <= lastino; ino++) {
13700                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13701                         continue;
13702                 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13703                         continue;
13704                 FREE_LOCK(ump);
13705                 error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13706                 if (error != 0) {
13707                         vn_finished_write(mp);
13708                         ACQUIRE_LOCK(ump);
13709                         return;
13710                 }
13711                 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13712                     FFSV_FORCEINSMQ)) != 0) {
13713                         softdep_error("clear_inodedeps: vget", error);
13714                         vfs_unbusy(mp);
13715                         vn_finished_write(mp);
13716                         ACQUIRE_LOCK(ump);
13717                         return;
13718                 }
13719                 vfs_unbusy(mp);
13720                 if (ino == lastino) {
13721                         if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13722                                 softdep_error("clear_inodedeps: fsync1", error);
13723                 } else {
13724                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13725                                 softdep_error("clear_inodedeps: fsync2", error);
13726                         BO_LOCK(&vp->v_bufobj);
13727                         drain_output(vp);
13728                         BO_UNLOCK(&vp->v_bufobj);
13729                 }
13730                 vput(vp);
13731                 vn_finished_write(mp);
13732                 ACQUIRE_LOCK(ump);
13733         }
13734 }
13735
13736 void
13737 softdep_buf_append(bp, wkhd)
13738         struct buf *bp;
13739         struct workhead *wkhd;
13740 {
13741         struct worklist *wk;
13742         struct ufsmount *ump;
13743
13744         if ((wk = LIST_FIRST(wkhd)) == NULL)
13745                 return;
13746         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13747             ("softdep_buf_append called on non-softdep filesystem"));
13748         ump = VFSTOUFS(wk->wk_mp);
13749         ACQUIRE_LOCK(ump);
13750         while ((wk = LIST_FIRST(wkhd)) != NULL) {
13751                 WORKLIST_REMOVE(wk);
13752                 WORKLIST_INSERT(&bp->b_dep, wk);
13753         }
13754         FREE_LOCK(ump);
13755
13756 }
13757
13758 void
13759 softdep_inode_append(ip, cred, wkhd)
13760         struct inode *ip;
13761         struct ucred *cred;
13762         struct workhead *wkhd;
13763 {
13764         struct buf *bp;
13765         struct fs *fs;
13766         int error;
13767
13768         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
13769             ("softdep_inode_append called on non-softdep filesystem"));
13770         fs = ip->i_fs;
13771         error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13772             (int)fs->fs_bsize, cred, &bp);
13773         if (error) {
13774                 bqrelse(bp);
13775                 softdep_freework(wkhd);
13776                 return;
13777         }
13778         softdep_buf_append(bp, wkhd);
13779         bqrelse(bp);
13780 }
13781
13782 void
13783 softdep_freework(wkhd)
13784         struct workhead *wkhd;
13785 {
13786         struct worklist *wk;
13787         struct ufsmount *ump;
13788
13789         if ((wk = LIST_FIRST(wkhd)) == NULL)
13790                 return;
13791         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13792             ("softdep_freework called on non-softdep filesystem"));
13793         ump = VFSTOUFS(wk->wk_mp);
13794         ACQUIRE_LOCK(ump);
13795         handle_jwork(wkhd);
13796         FREE_LOCK(ump);
13797 }
13798
13799 /*
13800  * Function to determine if the buffer has outstanding dependencies
13801  * that will cause a roll-back if the buffer is written. If wantcount
13802  * is set, return number of dependencies, otherwise just yes or no.
13803  */
13804 static int
13805 softdep_count_dependencies(bp, wantcount)
13806         struct buf *bp;
13807         int wantcount;
13808 {
13809         struct worklist *wk;
13810         struct ufsmount *ump;
13811         struct bmsafemap *bmsafemap;
13812         struct freework *freework;
13813         struct inodedep *inodedep;
13814         struct indirdep *indirdep;
13815         struct freeblks *freeblks;
13816         struct allocindir *aip;
13817         struct pagedep *pagedep;
13818         struct dirrem *dirrem;
13819         struct newblk *newblk;
13820         struct mkdir *mkdir;
13821         struct diradd *dap;
13822         int i, retval;
13823
13824         retval = 0;
13825         if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
13826                 return (0);
13827         ump = VFSTOUFS(wk->wk_mp);
13828         ACQUIRE_LOCK(ump);
13829         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13830                 switch (wk->wk_type) {
13831
13832                 case D_INODEDEP:
13833                         inodedep = WK_INODEDEP(wk);
13834                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13835                                 /* bitmap allocation dependency */
13836                                 retval += 1;
13837                                 if (!wantcount)
13838                                         goto out;
13839                         }
13840                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13841                                 /* direct block pointer dependency */
13842                                 retval += 1;
13843                                 if (!wantcount)
13844                                         goto out;
13845                         }
13846                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13847                                 /* direct block pointer dependency */
13848                                 retval += 1;
13849                                 if (!wantcount)
13850                                         goto out;
13851                         }
13852                         if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13853                                 /* Add reference dependency. */
13854                                 retval += 1;
13855                                 if (!wantcount)
13856                                         goto out;
13857                         }
13858                         continue;
13859
13860                 case D_INDIRDEP:
13861                         indirdep = WK_INDIRDEP(wk);
13862
13863                         TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13864                                 /* indirect truncation dependency */
13865                                 retval += 1;
13866                                 if (!wantcount)
13867                                         goto out;
13868                         }
13869
13870                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13871                                 /* indirect block pointer dependency */
13872                                 retval += 1;
13873                                 if (!wantcount)
13874                                         goto out;
13875                         }
13876                         continue;
13877
13878                 case D_PAGEDEP:
13879                         pagedep = WK_PAGEDEP(wk);
13880                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13881                                 if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13882                                         /* Journal remove ref dependency. */
13883                                         retval += 1;
13884                                         if (!wantcount)
13885                                                 goto out;
13886                                 }
13887                         }
13888                         for (i = 0; i < DAHASHSZ; i++) {
13889
13890                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13891                                         /* directory entry dependency */
13892                                         retval += 1;
13893                                         if (!wantcount)
13894                                                 goto out;
13895                                 }
13896                         }
13897                         continue;
13898
13899                 case D_BMSAFEMAP:
13900                         bmsafemap = WK_BMSAFEMAP(wk);
13901                         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13902                                 /* Add reference dependency. */
13903                                 retval += 1;
13904                                 if (!wantcount)
13905                                         goto out;
13906                         }
13907                         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13908                                 /* Allocate block dependency. */
13909                                 retval += 1;
13910                                 if (!wantcount)
13911                                         goto out;
13912                         }
13913                         continue;
13914
13915                 case D_FREEBLKS:
13916                         freeblks = WK_FREEBLKS(wk);
13917                         if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13918                                 /* Freeblk journal dependency. */
13919                                 retval += 1;
13920                                 if (!wantcount)
13921                                         goto out;
13922                         }
13923                         continue;
13924
13925                 case D_ALLOCDIRECT:
13926                 case D_ALLOCINDIR:
13927                         newblk = WK_NEWBLK(wk);
13928                         if (newblk->nb_jnewblk) {
13929                                 /* Journal allocate dependency. */
13930                                 retval += 1;
13931                                 if (!wantcount)
13932                                         goto out;
13933                         }
13934                         continue;
13935
13936                 case D_MKDIR:
13937                         mkdir = WK_MKDIR(wk);
13938                         if (mkdir->md_jaddref) {
13939                                 /* Journal reference dependency. */
13940                                 retval += 1;
13941                                 if (!wantcount)
13942                                         goto out;
13943                         }
13944                         continue;
13945
13946                 case D_FREEWORK:
13947                 case D_FREEDEP:
13948                 case D_JSEGDEP:
13949                 case D_JSEG:
13950                 case D_SBDEP:
13951                         /* never a dependency on these blocks */
13952                         continue;
13953
13954                 default:
13955                         panic("softdep_count_dependencies: Unexpected type %s",
13956                             TYPENAME(wk->wk_type));
13957                         /* NOTREACHED */
13958                 }
13959         }
13960 out:
13961         FREE_LOCK(ump);
13962         return retval;
13963 }
13964
13965 /*
13966  * Acquire exclusive access to a buffer.
13967  * Must be called with a locked mtx parameter.
13968  * Return acquired buffer or NULL on failure.
13969  */
13970 static struct buf *
13971 getdirtybuf(bp, lock, waitfor)
13972         struct buf *bp;
13973         struct rwlock *lock;
13974         int waitfor;
13975 {
13976         int error;
13977
13978         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13979                 if (waitfor != MNT_WAIT)
13980                         return (NULL);
13981                 error = BUF_LOCK(bp,
13982                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
13983                 /*
13984                  * Even if we sucessfully acquire bp here, we have dropped
13985                  * lock, which may violates our guarantee.
13986                  */
13987                 if (error == 0)
13988                         BUF_UNLOCK(bp);
13989                 else if (error != ENOLCK)
13990                         panic("getdirtybuf: inconsistent lock: %d", error);
13991                 rw_wlock(lock);
13992                 return (NULL);
13993         }
13994         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13995                 if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
13996                         rw_wunlock(lock);
13997                         BO_LOCK(bp->b_bufobj);
13998                         BUF_UNLOCK(bp);
13999                         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14000                                 bp->b_vflags |= BV_BKGRDWAIT;
14001                                 msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
14002                                        PRIBIO | PDROP, "getbuf", 0);
14003                         } else
14004                                 BO_UNLOCK(bp->b_bufobj);
14005                         rw_wlock(lock);
14006                         return (NULL);
14007                 }
14008                 BUF_UNLOCK(bp);
14009                 if (waitfor != MNT_WAIT)
14010                         return (NULL);
14011                 /*
14012                  * The lock argument must be bp->b_vp's mutex in
14013                  * this case.
14014                  */
14015 #ifdef  DEBUG_VFS_LOCKS
14016                 if (bp->b_vp->v_type != VCHR)
14017                         ASSERT_BO_WLOCKED(bp->b_bufobj);
14018 #endif
14019                 bp->b_vflags |= BV_BKGRDWAIT;
14020                 rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
14021                 return (NULL);
14022         }
14023         if ((bp->b_flags & B_DELWRI) == 0) {
14024                 BUF_UNLOCK(bp);
14025                 return (NULL);
14026         }
14027         bremfree(bp);
14028         return (bp);
14029 }
14030
14031
14032 /*
14033  * Check if it is safe to suspend the file system now.  On entry,
14034  * the vnode interlock for devvp should be held.  Return 0 with
14035  * the mount interlock held if the file system can be suspended now,
14036  * otherwise return EAGAIN with the mount interlock held.
14037  */
14038 int
14039 softdep_check_suspend(struct mount *mp,
14040                       struct vnode *devvp,
14041                       int softdep_depcnt,
14042                       int softdep_accdepcnt,
14043                       int secondary_writes,
14044                       int secondary_accwrites)
14045 {
14046         struct bufobj *bo;
14047         struct ufsmount *ump;
14048         struct inodedep *inodedep;
14049         int error, unlinked;
14050
14051         bo = &devvp->v_bufobj;
14052         ASSERT_BO_WLOCKED(bo);
14053
14054         /*
14055          * If we are not running with soft updates, then we need only
14056          * deal with secondary writes as we try to suspend.
14057          */
14058         if (MOUNTEDSOFTDEP(mp) == 0) {
14059                 MNT_ILOCK(mp);
14060                 while (mp->mnt_secondary_writes != 0) {
14061                         BO_UNLOCK(bo);
14062                         msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
14063                             (PUSER - 1) | PDROP, "secwr", 0);
14064                         BO_LOCK(bo);
14065                         MNT_ILOCK(mp);
14066                 }
14067
14068                 /*
14069                  * Reasons for needing more work before suspend:
14070                  * - Dirty buffers on devvp.
14071                  * - Secondary writes occurred after start of vnode sync loop
14072                  */
14073                 error = 0;
14074                 if (bo->bo_numoutput > 0 ||
14075                     bo->bo_dirty.bv_cnt > 0 ||
14076                     secondary_writes != 0 ||
14077                     mp->mnt_secondary_writes != 0 ||
14078                     secondary_accwrites != mp->mnt_secondary_accwrites)
14079                         error = EAGAIN;
14080                 BO_UNLOCK(bo);
14081                 return (error);
14082         }
14083
14084         /*
14085          * If we are running with soft updates, then we need to coordinate
14086          * with them as we try to suspend.
14087          */
14088         ump = VFSTOUFS(mp);
14089         for (;;) {
14090                 if (!TRY_ACQUIRE_LOCK(ump)) {
14091                         BO_UNLOCK(bo);
14092                         ACQUIRE_LOCK(ump);
14093                         FREE_LOCK(ump);
14094                         BO_LOCK(bo);
14095                         continue;
14096                 }
14097                 MNT_ILOCK(mp);
14098                 if (mp->mnt_secondary_writes != 0) {
14099                         FREE_LOCK(ump);
14100                         BO_UNLOCK(bo);
14101                         msleep(&mp->mnt_secondary_writes,
14102                                MNT_MTX(mp),
14103                                (PUSER - 1) | PDROP, "secwr", 0);
14104                         BO_LOCK(bo);
14105                         continue;
14106                 }
14107                 break;
14108         }
14109
14110         unlinked = 0;
14111         if (MOUNTEDSUJ(mp)) {
14112                 for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14113                     inodedep != NULL;
14114                     inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14115                         if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14116                             UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14117                             UNLINKONLIST) ||
14118                             !check_inodedep_free(inodedep))
14119                                 continue;
14120                         unlinked++;
14121                 }
14122         }
14123
14124         /*
14125          * Reasons for needing more work before suspend:
14126          * - Dirty buffers on devvp.
14127          * - Softdep activity occurred after start of vnode sync loop
14128          * - Secondary writes occurred after start of vnode sync loop
14129          */
14130         error = 0;
14131         if (bo->bo_numoutput > 0 ||
14132             bo->bo_dirty.bv_cnt > 0 ||
14133             softdep_depcnt != unlinked ||
14134             ump->softdep_deps != unlinked ||
14135             softdep_accdepcnt != ump->softdep_accdeps ||
14136             secondary_writes != 0 ||
14137             mp->mnt_secondary_writes != 0 ||
14138             secondary_accwrites != mp->mnt_secondary_accwrites)
14139                 error = EAGAIN;
14140         FREE_LOCK(ump);
14141         BO_UNLOCK(bo);
14142         return (error);
14143 }
14144
14145
14146 /*
14147  * Get the number of dependency structures for the file system, both
14148  * the current number and the total number allocated.  These will
14149  * later be used to detect that softdep processing has occurred.
14150  */
14151 void
14152 softdep_get_depcounts(struct mount *mp,
14153                       int *softdep_depsp,
14154                       int *softdep_accdepsp)
14155 {
14156         struct ufsmount *ump;
14157
14158         if (MOUNTEDSOFTDEP(mp) == 0) {
14159                 *softdep_depsp = 0;
14160                 *softdep_accdepsp = 0;
14161                 return;
14162         }
14163         ump = VFSTOUFS(mp);
14164         ACQUIRE_LOCK(ump);
14165         *softdep_depsp = ump->softdep_deps;
14166         *softdep_accdepsp = ump->softdep_accdeps;
14167         FREE_LOCK(ump);
14168 }
14169
14170 /*
14171  * Wait for pending output on a vnode to complete.
14172  * Must be called with vnode lock and interlock locked.
14173  *
14174  * XXX: Should just be a call to bufobj_wwait().
14175  */
14176 static void
14177 drain_output(vp)
14178         struct vnode *vp;
14179 {
14180         struct bufobj *bo;
14181
14182         bo = &vp->v_bufobj;
14183         ASSERT_VOP_LOCKED(vp, "drain_output");
14184         ASSERT_BO_WLOCKED(bo);
14185
14186         while (bo->bo_numoutput) {
14187                 bo->bo_flag |= BO_WWAIT;
14188                 msleep((caddr_t)&bo->bo_numoutput,
14189                     BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0);
14190         }
14191 }
14192
14193 /*
14194  * Called whenever a buffer that is being invalidated or reallocated
14195  * contains dependencies. This should only happen if an I/O error has
14196  * occurred. The routine is called with the buffer locked.
14197  */
14198 static void
14199 softdep_deallocate_dependencies(bp)
14200         struct buf *bp;
14201 {
14202
14203         if ((bp->b_ioflags & BIO_ERROR) == 0)
14204                 panic("softdep_deallocate_dependencies: dangling deps");
14205         if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14206                 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14207         else
14208                 printf("softdep_deallocate_dependencies: "
14209                     "got error %d while accessing filesystem\n", bp->b_error);
14210         if (bp->b_error != ENXIO)
14211                 panic("softdep_deallocate_dependencies: unrecovered I/O error");
14212 }
14213
/*
 * Function to handle asynchronous write errors in the filesystem.
 *
 * func is the string used as the message prefix and error is the error
 * number being reported.  The error is only logged; nothing is
 * recovered.
 */
static void
softdep_error(char *func, int error)
{

        /* XXX should do something better! */
        printf("%s: got error %d while accessing filesystem\n", func, error);
}
14226
14227 #ifdef DDB
14228
14229 static void
14230 inodedep_print(struct inodedep *inodedep, int verbose)
14231 {
14232         db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
14233             " saveino %p\n",
14234             inodedep, inodedep->id_fs, inodedep->id_state,
14235             (intmax_t)inodedep->id_ino,
14236             (intmax_t)fsbtodb(inodedep->id_fs,
14237             ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14238             inodedep->id_nlinkdelta, inodedep->id_savednlink,
14239             inodedep->id_savedino1);
14240
14241         if (verbose == 0)
14242                 return;
14243
14244         db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
14245             "mkdiradd %p\n",
14246             LIST_FIRST(&inodedep->id_pendinghd),
14247             LIST_FIRST(&inodedep->id_bufwait),
14248             LIST_FIRST(&inodedep->id_inowait),
14249             TAILQ_FIRST(&inodedep->id_inoreflst),
14250             inodedep->id_mkdiradd);
14251         db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
14252             TAILQ_FIRST(&inodedep->id_inoupdt),
14253             TAILQ_FIRST(&inodedep->id_newinoupdt),
14254             TAILQ_FIRST(&inodedep->id_extupdt),
14255             TAILQ_FIRST(&inodedep->id_newextupdt));
14256 }
14257
14258 DB_SHOW_COMMAND(inodedep, db_show_inodedep)
14259 {
14260
14261         if (have_addr == 0) {
14262                 db_printf("Address required\n");
14263                 return;
14264         }
14265         inodedep_print((struct inodedep*)addr, 1);
14266 }
14267
14268 DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
14269 {
14270         struct inodedep_hashhead *inodedephd;
14271         struct inodedep *inodedep;
14272         struct ufsmount *ump;
14273         int cnt;
14274
14275         if (have_addr == 0) {
14276                 db_printf("Address required\n");
14277                 return;
14278         }
14279         ump = (struct ufsmount *)addr;
14280         for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14281                 inodedephd = &ump->inodedep_hashtbl[cnt];
14282                 LIST_FOREACH(inodedep, inodedephd, id_hash) {
14283                         inodedep_print(inodedep, 0);
14284                 }
14285         }
14286 }
14287
14288 DB_SHOW_COMMAND(worklist, db_show_worklist)
14289 {
14290         struct worklist *wk;
14291
14292         if (have_addr == 0) {
14293                 db_printf("Address required\n");
14294                 return;
14295         }
14296         wk = (struct worklist *)addr;
14297         printf("worklist: %p type %s state 0x%X\n",
14298             wk, TYPENAME(wk->wk_type), wk->wk_state);
14299 }
14300
14301 DB_SHOW_COMMAND(workhead, db_show_workhead)
14302 {
14303         struct workhead *wkhd;
14304         struct worklist *wk;
14305         int i;
14306
14307         if (have_addr == 0) {
14308                 db_printf("Address required\n");
14309                 return;
14310         }
14311         wkhd = (struct workhead *)addr;
14312         wk = LIST_FIRST(wkhd);
14313         for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
14314                 db_printf("worklist: %p type %s state 0x%X",
14315                     wk, TYPENAME(wk->wk_type), wk->wk_state);
14316         if (i == 100)
14317                 db_printf("workhead overflow");
14318         printf("\n");
14319 }
14320
14321
14322 DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
14323 {
14324         struct mkdirlist *mkdirlisthd;
14325         struct jaddref *jaddref;
14326         struct diradd *diradd;
14327         struct mkdir *mkdir;
14328
14329         if (have_addr == 0) {
14330                 db_printf("Address required\n");
14331                 return;
14332         }
14333         mkdirlisthd = (struct mkdirlist *)addr;
14334         LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14335                 diradd = mkdir->md_diradd;
14336                 db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
14337                     mkdir, mkdir->md_state, diradd, diradd->da_state);
14338                 if ((jaddref = mkdir->md_jaddref) != NULL)
14339                         db_printf(" jaddref %p jaddref state 0x%X",
14340                             jaddref, jaddref->ja_state);
14341                 db_printf("\n");
14342         }
14343 }
14344
14345 /* exported to ffs_vfsops.c */
14346 extern void db_print_ffs(struct ufsmount *ump);
14347 void
14348 db_print_ffs(struct ufsmount *ump)
14349 {
14350         db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
14351             ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
14352             ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
14353             ump->softdep_deps, ump->softdep_req);
14354 }
14355
14356 #endif /* DDB */
14357
14358 #endif /* SOFTUPDATES */