sys/ufs/ffs/ffs_softdep.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause
   3  *
   4  * Copyright 1998, 2000 Marshall Kirk McKusick.
   5  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
   6  * All rights reserved.
   7  *
   8  * The soft updates code is derived from the appendix of a University
   9  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
  10  * "Soft Updates: A Solution to the Metadata Update Problem in File
  11  * Systems", CSE-TR-254-95, August 1995).
  12  *
  13  * Further information about soft updates can be obtained from:
  14  *
  15  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
  16  *      1614 Oxford Street              mckusick@mckusick.com
  17  *      Berkeley, CA 94709-1608         +1-510-843-9542
  18  *      USA
  19  *
  20  * Redistribution and use in source and binary forms, with or without
  21  * modification, are permitted provided that the following conditions
  22  * are met:
  23  *
  24  * 1. Redistributions of source code must retain the above copyright
  25  *    notice, this list of conditions and the following disclaimer.
  26  * 2. Redistributions in binary form must reproduce the above copyright
  27  *    notice, this list of conditions and the following disclaimer in the
  28  *    documentation and/or other materials provided with the distribution.
  29  *
  30  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  31  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  32  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  33  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  34  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  35  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  36  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  37  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
  38  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  39  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  40  *
  41  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
  42  */
  43
  44 #include <sys/cdefs.h>
  45 #include "opt_ffs.h"
  46 #include "opt_quota.h"
  47 #include "opt_ddb.h"
  48
  49 #include <sys/param.h>
  50 #include <sys/kernel.h>
  51 #include <sys/systm.h>
  52 #include <sys/bio.h>
  53 #include <sys/buf.h>
  54 #include <sys/kdb.h>
  55 #include <sys/kthread.h>
  56 #include <sys/ktr.h>
  57 #include <sys/limits.h>
  58 #include <sys/lock.h>
  59 #include <sys/malloc.h>
  60 #include <sys/mount.h>
  61 #include <sys/mutex.h>
  62 #include <sys/namei.h>
  63 #include <sys/priv.h>
  64 #include <sys/proc.h>
  65 #include <sys/racct.h>
  66 #include <sys/rwlock.h>
  67 #include <sys/stat.h>
  68 #include <sys/sysctl.h>
  69 #include <sys/syslog.h>
  70 #include <sys/vnode.h>
  71 #include <sys/conf.h>
  72
  73 #include <ufs/ufs/dir.h>
  74 #include <ufs/ufs/extattr.h>
  75 #include <ufs/ufs/quota.h>
  76 #include <ufs/ufs/inode.h>
  77 #include <ufs/ufs/ufsmount.h>
  78 #include <ufs/ffs/fs.h>
  79 #include <ufs/ffs/softdep.h>
  80 #include <ufs/ffs/ffs_extern.h>
  81 #include <ufs/ufs/ufs_extern.h>
  82
  83 #include <vm/vm.h>
  84 #include <vm/vm_extern.h>
  85 #include <vm/vm_object.h>
  86
  87 #include <geom/geom.h>
  88 #include <geom/geom_vfs.h>
  89
  90 #include <ddb/ddb.h>
  91
/*
 * KTR class constant for this file.  It is defined as 0 here; the note
 * on the line says to define it to KTR_SPARE instead.
 * NOTE(review): no user of KTR_SUJ is visible in this part of the file;
 * presumably it is the class mask passed to the CTR*() trace points --
 * confirm before relying on that.
 */
#define KTR_SUJ 0       /* Define to KTR_SPARE. */
  93
  94 #ifndef SOFTUPDATES
  95
  96 int
  97 softdep_flushfiles(struct mount *oldmnt,
  98         int flags,
  99         struct thread *td)
 100 {
 101
 102         panic("softdep_flushfiles called");
 103 }
 104
 105 int
 106 softdep_mount(struct vnode *devvp,
 107         struct mount *mp,
 108         struct fs *fs,
 109         struct ucred *cred)
 110 {
 111
 112         return (0);
 113 }
 114
 115 void
 116 softdep_initialize(void)
 117 {
 118
 119         return;
 120 }
 121
 122 void
 123 softdep_uninitialize(void)
 124 {
 125
 126         return;
 127 }
 128
 129 void
 130 softdep_unmount(struct mount *mp)
 131 {
 132
 133         panic("softdep_unmount called");
 134 }
 135
 136 void
 137 softdep_setup_sbupdate(struct ufsmount *ump,
 138         struct fs *fs,
 139         struct buf *bp)
 140 {
 141
 142         panic("softdep_setup_sbupdate called");
 143 }
 144
 145 void
 146 softdep_setup_inomapdep(struct buf *bp,
 147         struct inode *ip,
 148         ino_t newinum,
 149         int mode)
 150 {
 151
 152         panic("softdep_setup_inomapdep called");
 153 }
 154
 155 void
 156 softdep_setup_blkmapdep(struct buf *bp,
 157         struct mount *mp,
 158         ufs2_daddr_t newblkno,
 159         int frags,
 160         int oldfrags)
 161 {
 162
 163         panic("softdep_setup_blkmapdep called");
 164 }
 165
 166 void
 167 softdep_setup_allocdirect(struct inode *ip,
 168         ufs_lbn_t lbn,
 169         ufs2_daddr_t newblkno,
 170         ufs2_daddr_t oldblkno,
 171         long newsize,
 172         long oldsize,
 173         struct buf *bp)
 174 {
 175
 176         panic("softdep_setup_allocdirect called");
 177 }
 178
 179 void
 180 softdep_setup_allocext(struct inode *ip,
 181         ufs_lbn_t lbn,
 182         ufs2_daddr_t newblkno,
 183         ufs2_daddr_t oldblkno,
 184         long newsize,
 185         long oldsize,
 186         struct buf *bp)
 187 {
 188
 189         panic("softdep_setup_allocext called");
 190 }
 191
 192 void
 193 softdep_setup_allocindir_page(struct inode *ip,
 194         ufs_lbn_t lbn,
 195         struct buf *bp,
 196         int ptrno,
 197         ufs2_daddr_t newblkno,
 198         ufs2_daddr_t oldblkno,
 199         struct buf *nbp)
 200 {
 201
 202         panic("softdep_setup_allocindir_page called");
 203 }
 204
 205 void
 206 softdep_setup_allocindir_meta(struct buf *nbp,
 207         struct inode *ip,
 208         struct buf *bp,
 209         int ptrno,
 210         ufs2_daddr_t newblkno)
 211 {
 212
 213         panic("softdep_setup_allocindir_meta called");
 214 }
 215
 216 void
 217 softdep_journal_freeblocks(struct inode *ip,
 218         struct ucred *cred,
 219         off_t length,
 220         int flags)
 221 {
 222
 223         panic("softdep_journal_freeblocks called");
 224 }
 225
 226 void
 227 softdep_journal_fsync(struct inode *ip)
 228 {
 229
 230         panic("softdep_journal_fsync called");
 231 }
 232
 233 void
 234 softdep_setup_freeblocks(struct inode *ip,
 235         off_t length,
 236         int flags)
 237 {
 238
 239         panic("softdep_setup_freeblocks called");
 240 }
 241
 242 void
 243 softdep_freefile(struct vnode *pvp,
 244                 ino_t ino,
 245                 int mode)
 246 {
 247
 248         panic("softdep_freefile called");
 249 }
 250
 251 int
 252 softdep_setup_directory_add(struct buf *bp,
 253         struct inode *dp,
 254         off_t diroffset,
 255         ino_t newinum,
 256         struct buf *newdirbp,
 257         int isnewblk)
 258 {
 259
 260         panic("softdep_setup_directory_add called");
 261 }
 262
 263 void
 264 softdep_change_directoryentry_offset(struct buf *bp,
 265         struct inode *dp,
 266         caddr_t base,
 267         caddr_t oldloc,
 268         caddr_t newloc,
 269         int entrysize)
 270 {
 271
 272         panic("softdep_change_directoryentry_offset called");
 273 }
 274
 275 void
 276 softdep_setup_remove(struct buf *bp,
 277         struct inode *dp,
 278         struct inode *ip,
 279         int isrmdir)
 280 {
 281
 282         panic("softdep_setup_remove called");
 283 }
 284
 285 void
 286 softdep_setup_directory_change(struct buf *bp,
 287         struct inode *dp,
 288         struct inode *ip,
 289         ino_t newinum,
 290         int isrmdir)
 291 {
 292
 293         panic("softdep_setup_directory_change called");
 294 }
 295
 296 void
 297 softdep_setup_blkfree(struct mount *mp,
 298         struct buf *bp,
 299         ufs2_daddr_t blkno,
 300         int frags,
 301         struct workhead *wkhd,
 302         bool doingrecovery)
 303 {
 304
 305         panic("%s called", __FUNCTION__);
 306 }
 307
 308 void
 309 softdep_setup_inofree(struct mount *mp,
 310         struct buf *bp,
 311         ino_t ino,
 312         struct workhead *wkhd,
 313         bool doingrecovery)
 314 {
 315
 316         panic("%s called", __FUNCTION__);
 317 }
 318
 319 void
 320 softdep_setup_unlink(struct inode *dp, struct inode *ip)
 321 {
 322
 323         panic("%s called", __FUNCTION__);
 324 }
 325
 326 void
 327 softdep_setup_link(struct inode *dp, struct inode *ip)
 328 {
 329
 330         panic("%s called", __FUNCTION__);
 331 }
 332
 333 void
 334 softdep_revert_link(struct inode *dp, struct inode *ip)
 335 {
 336
 337         panic("%s called", __FUNCTION__);
 338 }
 339
 340 void
 341 softdep_setup_rmdir(struct inode *dp, struct inode *ip)
 342 {
 343
 344         panic("%s called", __FUNCTION__);
 345 }
 346
 347 void
 348 softdep_revert_rmdir(struct inode *dp, struct inode *ip)
 349 {
 350
 351         panic("%s called", __FUNCTION__);
 352 }
 353
 354 void
 355 softdep_setup_create(struct inode *dp, struct inode *ip)
 356 {
 357
 358         panic("%s called", __FUNCTION__);
 359 }
 360
 361 void
 362 softdep_revert_create(struct inode *dp, struct inode *ip)
 363 {
 364
 365         panic("%s called", __FUNCTION__);
 366 }
 367
 368 void
 369 softdep_setup_mkdir(struct inode *dp, struct inode *ip)
 370 {
 371
 372         panic("%s called", __FUNCTION__);
 373 }
 374
 375 void
 376 softdep_revert_mkdir(struct inode *dp, struct inode *ip)
 377 {
 378
 379         panic("%s called", __FUNCTION__);
 380 }
 381
 382 void
 383 softdep_setup_dotdot_link(struct inode *dp, struct inode *ip)
 384 {
 385
 386         panic("%s called", __FUNCTION__);
 387 }
 388
 389 int
 390 softdep_prealloc(struct vnode *vp, int waitok)
 391 {
 392
 393         panic("%s called", __FUNCTION__);
 394 }
 395
 396 int
 397 softdep_journal_lookup(struct mount *mp, struct vnode **vpp)
 398 {
 399
 400         return (ENOENT);
 401 }
 402
 403 void
 404 softdep_change_linkcnt(struct inode *ip)
 405 {
 406
 407         panic("softdep_change_linkcnt called");
 408 }
 409
 410 void
 411 softdep_load_inodeblock(struct inode *ip)
 412 {
 413
 414         panic("softdep_load_inodeblock called");
 415 }
 416
 417 void
 418 softdep_update_inodeblock(struct inode *ip,
 419         struct buf *bp,
 420         int waitfor)
 421 {
 422
 423         panic("softdep_update_inodeblock called");
 424 }
 425
 426 int
 427 softdep_fsync(struct vnode *vp) /* the "in_core" copy of the inode */
 428 {
 429
 430         return (0);
 431 }
 432
 433 void
 434 softdep_fsync_mountdev(struct vnode *vp)
 435 {
 436
 437         return;
 438 }
 439
 440 int
 441 softdep_flushworklist(struct mount *oldmnt,
 442         int *countp,
 443         struct thread *td)
 444 {
 445
 446         *countp = 0;
 447         return (0);
 448 }
 449
 450 int
 451 softdep_sync_metadata(struct vnode *vp)
 452 {
 453
 454         panic("softdep_sync_metadata called");
 455 }
 456
 457 int
 458 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
 459 {
 460
 461         panic("softdep_sync_buf called");
 462 }
 463
 464 int
 465 softdep_slowdown(struct vnode *vp)
 466 {
 467
 468         panic("softdep_slowdown called");
 469 }
 470
 471 int
 472 softdep_request_cleanup(struct fs *fs,
 473         struct vnode *vp,
 474         struct ucred *cred,
 475         int resource)
 476 {
 477
 478         return (0);
 479 }
 480
 481 int
 482 softdep_check_suspend(struct mount *mp,
 483                       struct vnode *devvp,
 484                       int softdep_depcnt,
 485                       int softdep_accdepcnt,
 486                       int secondary_writes,
 487                       int secondary_accwrites)
 488 {
 489         struct bufobj *bo;
 490         int error;
 491
 492         (void) softdep_depcnt,
 493         (void) softdep_accdepcnt;
 494
 495         bo = &devvp->v_bufobj;
 496         ASSERT_BO_WLOCKED(bo);
 497
 498         MNT_ILOCK(mp);
 499         while (mp->mnt_secondary_writes != 0) {
 500                 BO_UNLOCK(bo);
 501                 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
 502                     (PUSER - 1) | PDROP, "secwr", 0);
 503                 BO_LOCK(bo);
 504                 MNT_ILOCK(mp);
 505         }
 506
 507         /*
 508          * Reasons for needing more work before suspend:
 509          * - Dirty buffers on devvp.
 510          * - Secondary writes occurred after start of vnode sync loop
 511          */
 512         error = 0;
 513         if (bo->bo_numoutput > 0 ||
 514             bo->bo_dirty.bv_cnt > 0 ||
 515             secondary_writes != 0 ||
 516             mp->mnt_secondary_writes != 0 ||
 517             secondary_accwrites != mp->mnt_secondary_accwrites)
 518                 error = EAGAIN;
 519         BO_UNLOCK(bo);
 520         return (error);
 521 }
 522
 523 void
 524 softdep_get_depcounts(struct mount *mp,
 525                       int *softdepactivep,
 526                       int *softdepactiveaccp)
 527 {
 528         (void) mp;
 529         *softdepactivep = 0;
 530         *softdepactiveaccp = 0;
 531 }
 532
 533 void
 534 softdep_buf_append(struct buf *bp, struct workhead *wkhd)
 535 {
 536
 537         panic("softdep_buf_appendwork called");
 538 }
 539
 540 void
 541 softdep_inode_append(struct inode *ip,
 542         struct ucred *cred,
 543         struct workhead *wkhd)
 544 {
 545
 546         panic("softdep_inode_appendwork called");
 547 }
 548
 549 void
 550 softdep_freework(struct workhead *wkhd)
 551 {
 552
 553         panic("softdep_freework called");
 554 }
 555
 556 int
 557 softdep_prerename(struct vnode *fdvp,
 558         struct vnode *fvp,
 559         struct vnode *tdvp,
 560         struct vnode *tvp)
 561 {
 562
 563         panic("softdep_prerename called");
 564 }
 565
 566 int
 567 softdep_prelink(struct vnode *dvp,
 568         struct vnode *vp,
 569         struct componentname *cnp)
 570 {
 571
 572         panic("softdep_prelink called");
 573 }
 574
 575 #else
 576
/*
 * Advertise the "softupdates" feature and create the sysctl tree used for
 * soft updates statistics: a "softdep" node under _debug with four child
 * nodes (total, highuse, current, write), one per statistic kind.
 */
FEATURE(softupdates, "FFS soft-updates support");

static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "soft updates stats");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, total,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "total dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "high use dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, current,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "current dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, write,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "current dependencies written");
 593
/*
 * Per-type statistics counters, one slot per dependency type and indexed
 * by the D_* type number (hence D_LAST + 1 entries).  SOFTDEP_TYPE()
 * exports each slot as a read-only sysctl.  The code that updates the
 * counters is outside this part of the file.
 */
unsigned long dep_current[D_LAST + 1];
unsigned long dep_highuse[D_LAST + 1];
unsigned long dep_total[D_LAST + 1];
unsigned long dep_write[D_LAST + 1];
 598
 599 #define SOFTDEP_TYPE(type, str, long)                                   \
 600     static MALLOC_DEFINE(M_ ## type, #str, long);                       \
 601     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,       \
 602         &dep_total[D_ ## type], 0, "");                                 \
 603     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,     \
 604         &dep_current[D_ ## type], 0, "");                               \
 605     SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD,     \
 606         &dep_highuse[D_ ## type], 0, "");                               \
 607     SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,       \
 608         &dep_write[D_ ## type], 0, "");
 609
/*
 * One SOFTDEP_TYPE() per dependency type.  Arguments are the type suffix
 * (pasted onto M_ and D_), the short name used for the malloc type and
 * the sysctl leaves, and the malloc type description.
 */
SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
    "Block or frag allocated from cyl group map");
SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
 638
/*
 * Malloc types declared directly rather than through SOFTDEP_TYPE(), so
 * no statistics sysctls are created for them.
 */
static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");

static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");

/* Allocation flags for soft updates allocations: sleep until satisfied. */
#define M_SOFTDEP_FLAGS (M_WAITOK)
 646
/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 *
 * The table is positional: each entry's index has to equal the matching
 * D_* type number, so entries must never be reordered and a new type has
 * to be added at the index of its D_* value.  Slot 0 is NULL.
 * NOTE(review): presumably no dependency type is numbered 0 -- confirm
 * against the D_* definitions.
 */
static struct malloc_type *memtype[] = {
        NULL,
        M_PAGEDEP,
        M_INODEDEP,
        M_BMSAFEMAP,
        M_NEWBLK,
        M_ALLOCDIRECT,
        M_INDIRDEP,
        M_ALLOCINDIR,
        M_FREEFRAG,
        M_FREEBLKS,
        M_FREEFILE,
        M_DIRADD,
        M_MKDIR,
        M_DIRREM,
        M_NEWDIRBLK,
        M_FREEWORK,
        M_FREEDEP,
        M_JADDREF,
        M_JREMREF,
        M_JMVREF,
        M_JNEWBLK,
        M_JFREEBLK,
        M_JFREEFRAG,
        M_JSEG,
        M_JSEGDEP,
        M_SBDEP,
        M_JTRUNC,
        M_JFSYNC,
        M_SENTINEL
};
 682
/*
 * Map a workitem type number to its malloc type.  No range check is
 * done; the argument is used directly as an index into memtype[].
 */
#define DtoM(type) (memtype[type])

/*
 * Names of malloc types.
 *
 * Yields the short description of the malloc type for a workitem type in
 * the range D_FIRST..D_LAST, and "???" for anything else.  The unsigned
 * casts make a negative type compare as a large value and so fall into
 * the "???" case.  The argument is expanded up to three times, so it
 * must not have side effects.
 */
#define TYPENAME(type)  \
        ((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \
        memtype[type]->ks_shortdesc : "???")
 691 /*
 692  * End system adaptation definitions.
 693  */
 694
/*
 * Byte offsets of the dotdot_ino and dot_ino members within
 * struct dirtemplate.
 */
#define DOTDOT_OFFSET   offsetof(struct dirtemplate, dotdot_ino)
#define DOT_OFFSET      offsetof(struct dirtemplate, dot_ino)
 697
 698 /*
 699  * Internal function prototypes.
 700  */
 701 static  void check_clear_deps(struct mount *);
 702 static  void softdep_error(char *, int);
 703 static  int softdep_prerename_vnode(struct ufsmount *, struct vnode *);
 704 static  int softdep_process_worklist(struct mount *, int);
 705 static  int softdep_waitidle(struct mount *, int);
 706 static  void drain_output(struct vnode *);
 707 static  struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
 708 static  int check_inodedep_free(struct inodedep *);
 709 static  void clear_remove(struct mount *);
 710 static  void clear_inodedeps(struct mount *);
 711 static  void unlinked_inodedep(struct mount *, struct inodedep *);
 712 static  void clear_unlinked_inodedep(struct inodedep *);
 713 static  struct inodedep *first_unlinked_inodedep(struct ufsmount *);
 714 static  int flush_pagedep_deps(struct vnode *, struct mount *,
 715             struct diraddhd *, struct buf *);
 716 static  int free_pagedep(struct pagedep *);
 717 static  int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
 718 static  int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
 719 static  int flush_deplist(struct allocdirectlst *, int, int *);
 720 static  int sync_cgs(struct mount *, int);
 721 static  int handle_written_filepage(struct pagedep *, struct buf *, int);
 722 static  int handle_written_sbdep(struct sbdep *, struct buf *);
 723 static  void initiate_write_sbdep(struct sbdep *);
 724 static  void diradd_inode_written(struct diradd *, struct inodedep *);
 725 static  int handle_written_indirdep(struct indirdep *, struct buf *,
 726             struct buf**, int);
 727 static  int handle_written_inodeblock(struct inodedep *, struct buf *, int);
 728 static  int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
 729             uint8_t *);
 730 static  int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
 731 static  void handle_written_jaddref(struct jaddref *);
 732 static  void handle_written_jremref(struct jremref *);
 733 static  void handle_written_jseg(struct jseg *, struct buf *);
 734 static  void handle_written_jnewblk(struct jnewblk *);
 735 static  void handle_written_jblkdep(struct jblkdep *);
 736 static  void handle_written_jfreefrag(struct jfreefrag *);
 737 static  void complete_jseg(struct jseg *);
 738 static  void complete_jsegs(struct jseg *);
 739 static  void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
 740 static  void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
 741 static  void jremref_write(struct jremref *, struct jseg *, uint8_t *);
 742 static  void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
 743 static  void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
 744 static  void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
 745 static  void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
 746 static  void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
 747 static  void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
 748 static  inline void inoref_write(struct inoref *, struct jseg *,
 749             struct jrefrec *);
 750 static  void handle_allocdirect_partdone(struct allocdirect *,
 751             struct workhead *);
 752 static  struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
 753             struct workhead *);
 754 static  void indirdep_complete(struct indirdep *);
 755 static  int indirblk_lookup(struct mount *, ufs2_daddr_t);
 756 static  void indirblk_insert(struct freework *);
 757 static  void indirblk_remove(struct freework *);
 758 static  void handle_allocindir_partdone(struct allocindir *);
 759 static  void initiate_write_filepage(struct pagedep *, struct buf *);
 760 static  void initiate_write_indirdep(struct indirdep*, struct buf *);
 761 static  void handle_written_mkdir(struct mkdir *, int);
 762 static  int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
 763             uint8_t *);
 764 static  void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
 765 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 766 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
 767 static  void handle_workitem_freefile(struct freefile *);
 768 static  int handle_workitem_remove(struct dirrem *, int);
 769 static  struct dirrem *newdirrem(struct buf *, struct inode *,
 770             struct inode *, int, struct dirrem **);
 771 static  struct indirdep *indirdep_lookup(struct mount *, struct inode *,
 772             struct buf *);
 773 static  void cancel_indirdep(struct indirdep *, struct buf *,
 774             struct freeblks *);
 775 static  void free_indirdep(struct indirdep *);
 776 static  void free_diradd(struct diradd *, struct workhead *);
 777 static  void merge_diradd(struct inodedep *, struct diradd *);
 778 static  void complete_diradd(struct diradd *);
 779 static  struct diradd *diradd_lookup(struct pagedep *, int);
 780 static  struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
 781             struct jremref *);
 782 static  struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
 783             struct jremref *);
 784 static  void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
 785             struct jremref *, struct jremref *);
 786 static  void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
 787             struct jremref *);
 788 static  void cancel_allocindir(struct allocindir *, struct buf *bp,
 789             struct freeblks *, int);
 790 static  int setup_trunc_indir(struct freeblks *, struct inode *,
 791             ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
 792 static  void complete_trunc_indir(struct freework *);
 793 static  void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
 794             int);
 795 static  void complete_mkdir(struct mkdir *);
 796 static  void free_newdirblk(struct newdirblk *);
 797 static  void free_jremref(struct jremref *);
 798 static  void free_jaddref(struct jaddref *);
 799 static  void free_jsegdep(struct jsegdep *);
 800 static  void free_jsegs(struct jblocks *);
 801 static  void rele_jseg(struct jseg *);
 802 static  void free_jseg(struct jseg *, struct jblocks *);
 803 static  void free_jnewblk(struct jnewblk *);
 804 static  void free_jblkdep(struct jblkdep *);
 805 static  void free_jfreefrag(struct jfreefrag *);
 806 static  void free_freedep(struct freedep *);
 807 static  void journal_jremref(struct dirrem *, struct jremref *,
 808             struct inodedep *);
 809 static  void cancel_jnewblk(struct jnewblk *, struct workhead *);
 810 static  int cancel_jaddref(struct jaddref *, struct inodedep *,
 811             struct workhead *);
 812 static  void cancel_jfreefrag(struct jfreefrag *);
 813 static  inline void setup_freedirect(struct freeblks *, struct inode *,
 814             int, int);
 815 static  inline void setup_freeext(struct freeblks *, struct inode *, int, int);
 816 static  inline void setup_freeindir(struct freeblks *, struct inode *, int,
 817             ufs_lbn_t, int);
 818 static  inline struct freeblks *newfreeblks(struct mount *, struct inode *);
 819 static  void freeblks_free(struct ufsmount *, struct freeblks *, int);
 820 static  void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
 821 static  ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
 822 static  int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
 823 static  void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
 824             int, int);
 825 static  void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
 826 static  int cancel_pagedep(struct pagedep *, struct freeblks *, int);
 827 static  int deallocate_dependencies(struct buf *, struct freeblks *, int);
 828 static  void newblk_freefrag(struct newblk*);
 829 static  void free_newblk(struct newblk *);
 830 static  void cancel_allocdirect(struct allocdirectlst *,
 831             struct allocdirect *, struct freeblks *);
 832 static  int check_inode_unwritten(struct inodedep *);
 833 static  int free_inodedep(struct inodedep *);
 834 static  void freework_freeblock(struct freework *, uint64_t);
 835 static  void freework_enqueue(struct freework *);
 836 static  int handle_workitem_freeblocks(struct freeblks *, int);
 837 static  int handle_complete_freeblocks(struct freeblks *, int);
 838 static  void handle_workitem_indirblk(struct freework *);
 839 static  void handle_written_freework(struct freework *);
 840 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
 841 static  struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
 842             struct workhead *);
 843 static  struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
 844             struct inodedep *, struct allocindir *, ufs_lbn_t);
 845 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
 846             ufs2_daddr_t, ufs_lbn_t);
 847 static  void handle_workitem_freefrag(struct freefrag *);
 848 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
 849             ufs_lbn_t, uint64_t);
 850 static  void allocdirect_merge(struct allocdirectlst *,
 851             struct allocdirect *, struct allocdirect *);
 852 static  struct freefrag *allocindir_merge(struct allocindir *,
 853             struct allocindir *);
 854 static  int bmsafemap_find(struct bmsafemap_hashhead *, int,
 855             struct bmsafemap **);
 856 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
 857             int cg, struct bmsafemap *);
 858 static  int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
 859             struct newblk **);
 860 static  int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
 861 static  int inodedep_find(struct inodedep_hashhead *, ino_t,
 862             struct inodedep **);
 863 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
 864 static  int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
 865             int, struct pagedep **);
 866 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 867             struct pagedep **);
 868 static  void pause_timer(void *);
 869 static  int request_cleanup(struct mount *, int);
 870 static  int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
 871 static  void schedule_cleanup(struct mount *);
 872 static void softdep_ast_cleanup_proc(struct thread *);
 873 static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
 874 static  int process_worklist_item(struct mount *, int, int);
 875 static  void process_removes(struct vnode *);
 876 static  void process_truncates(struct vnode *);
 877 static  void jwork_move(struct workhead *, struct workhead *);
 878 static  void jwork_insert(struct workhead *, struct jsegdep *);
 879 static  void add_to_worklist(struct worklist *, int);
 880 static  void wake_worklist(struct worklist *);
 881 static  void wait_worklist(struct worklist *, char *);
 882 static  void remove_from_worklist(struct worklist *);
 883 static  void softdep_flush(void *);
 884 static  void softdep_flushjournal(struct mount *);
 885 static  int softdep_speedup(struct ufsmount *);
 886 static  void worklist_speedup(struct mount *);
 887 static  int journal_mount(struct mount *, struct fs *, struct ucred *);
 888 static  void journal_unmount(struct ufsmount *);
 889 static  int journal_space(struct ufsmount *, int);
 890 static  void journal_suspend(struct ufsmount *);
 891 static  int journal_unsuspend(struct ufsmount *ump);
 892 static  void add_to_journal(struct worklist *);
 893 static  void remove_from_journal(struct worklist *);
 894 static  bool softdep_excess_items(struct ufsmount *, int);
 895 static  void softdep_process_journal(struct mount *, struct worklist *, int);
 896 static  struct jremref *newjremref(struct dirrem *, struct inode *,
 897             struct inode *ip, off_t, nlink_t);
 898 static  struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
 899             uint16_t);
 900 static  inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
 901             uint16_t);
 902 static  inline struct jsegdep *inoref_jseg(struct inoref *);
 903 static  struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
 904 static  struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
 905             ufs2_daddr_t, int);
 906 static  void adjust_newfreework(struct freeblks *, int);
 907 static  struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
 908 static  void move_newblock_dep(struct jaddref *, struct inodedep *);
 909 static  void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
 910 static  struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
 911             ufs2_daddr_t, long, ufs_lbn_t);
 912 static  struct freework *newfreework(struct ufsmount *, struct freeblks *,
 913             struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
 914 static  int jwait(struct worklist *, int);
 915 static  struct inodedep *inodedep_lookup_ip(struct inode *);
 916 static  int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
 917 static  struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
 918 static  void handle_jwork(struct workhead *);
 919 static  struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
 920             struct mkdir **);
 921 static  struct jblocks *jblocks_create(void);
 922 static  ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
 923 static  void jblocks_free(struct jblocks *, struct mount *, int);
 924 static  void jblocks_destroy(struct jblocks *);
 925 static  void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
 926
 927 /*
 928  * Exported softdep operations.
 929  */
 930 static  void softdep_disk_io_initiation(struct buf *);
 931 static  void softdep_disk_write_complete(struct buf *);
 932 static  void softdep_deallocate_dependencies(struct buf *);
 933 static  int softdep_count_dependencies(struct buf *bp, int);
 934
 935 /*
 936  * Global lock over all of soft updates.
 937  */
 938 static struct mtx lk;
 939 MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF);
 940
 941 #define ACQUIRE_GBLLOCK(lk)     mtx_lock(lk)
 942 #define FREE_GBLLOCK(lk)        mtx_unlock(lk)
 943 #define GBLLOCK_OWNED(lk)       mtx_assert((lk), MA_OWNED)
 944
 945 /*
 946  * Per-filesystem soft-updates locking.
 947  */
 948 #define LOCK_PTR(ump)           (&(ump)->um_softdep->sd_fslock)
 949 #define TRY_ACQUIRE_LOCK(ump)   rw_try_wlock(&(ump)->um_softdep->sd_fslock)
 950 #define ACQUIRE_LOCK(ump)       rw_wlock(&(ump)->um_softdep->sd_fslock)
 951 #define FREE_LOCK(ump)          rw_wunlock(&(ump)->um_softdep->sd_fslock)
 952 #define LOCK_OWNED(ump)         rw_assert(&(ump)->um_softdep->sd_fslock, \
 953                                     RA_WLOCKED)
 954
 955 #define BUF_AREC(bp)            lockallowrecurse(&(bp)->b_lock)
 956 #define BUF_NOREC(bp)           lockdisablerecurse(&(bp)->b_lock)
 957
 958 /*
 959  * Worklist queue management.
 960  * These routines require that the lock be held.
 961  */
 962 #ifndef /* NOT */ INVARIANTS
 963 #define WORKLIST_INSERT(head, item) do {        \
 964         (item)->wk_state |= ONWORKLIST;         \
 965         LIST_INSERT_HEAD(head, item, wk_list);  \
 966 } while (0)
 967 #define WORKLIST_REMOVE(item) do {              \
 968         (item)->wk_state &= ~ONWORKLIST;        \
 969         LIST_REMOVE(item, wk_list);             \
 970 } while (0)
 971 #define WORKLIST_INSERT_UNLOCKED        WORKLIST_INSERT
 972 #define WORKLIST_REMOVE_UNLOCKED        WORKLIST_REMOVE
 973
 974 #else /* INVARIANTS */
 975 static  void worklist_insert(struct workhead *, struct worklist *, int,
 976         const char *, int);
 977 static  void worklist_remove(struct worklist *, int, const char *, int);
 978
 979 #define WORKLIST_INSERT(head, item) \
 980         worklist_insert(head, item, 1, __func__, __LINE__)
 981 #define WORKLIST_INSERT_UNLOCKED(head, item)\
 982         worklist_insert(head, item, 0, __func__, __LINE__)
 983 #define WORKLIST_REMOVE(item)\
 984         worklist_remove(item, 1, __func__, __LINE__)
 985 #define WORKLIST_REMOVE_UNLOCKED(item)\
 986         worklist_remove(item, 0, __func__, __LINE__)
 987
 988 static void
 989 worklist_insert(struct workhead *head,
 990         struct worklist *item,
 991         int locked,
 992         const char *func,
 993         int line)
 994 {
 995
 996         if (locked)
 997                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
 998         if (item->wk_state & ONWORKLIST)
 999                 panic("worklist_insert: %p %s(0x%X) already on list, "
1000                     "added in function %s at line %d",
1001                     item, TYPENAME(item->wk_type), item->wk_state,
1002                     item->wk_func, item->wk_line);
1003         item->wk_state |= ONWORKLIST;
1004         item->wk_func = func;
1005         item->wk_line = line;
1006         LIST_INSERT_HEAD(head, item, wk_list);
1007 }
1008
1009 static void
1010 worklist_remove(struct worklist *item,
1011         int locked,
1012         const char *func,
1013         int line)
1014 {
1015
1016         if (locked)
1017                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1018         if ((item->wk_state & ONWORKLIST) == 0)
1019                 panic("worklist_remove: %p %s(0x%X) not on list, "
1020                     "removed in function %s at line %d",
1021                     item, TYPENAME(item->wk_type), item->wk_state,
1022                     item->wk_func, item->wk_line);
1023         item->wk_state &= ~ONWORKLIST;
1024         item->wk_func = func;
1025         item->wk_line = line;
1026         LIST_REMOVE(item, wk_list);
1027 }
1028 #endif /* INVARIANTS */
1029
1030 /*
1031  * Merge two jsegdeps keeping only the oldest one as newer references
1032  * can't be discarded until after older references.
1033  */
1034 static inline struct jsegdep *
1035 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1036 {
1037         struct jsegdep *swp;
1038
1039         if (two == NULL)
1040                 return (one);
1041
1042         if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1043                 swp = one;
1044                 one = two;
1045                 two = swp;
1046         }
1047         WORKLIST_REMOVE(&two->jd_list);
1048         free_jsegdep(two);
1049
1050         return (one);
1051 }
1052
1053 /*
1054  * If two freedeps are compatible free one to reduce list size.
1055  */
1056 static inline struct freedep *
1057 freedep_merge(struct freedep *one, struct freedep *two)
1058 {
1059         if (two == NULL)
1060                 return (one);
1061
1062         if (one->fd_freework == two->fd_freework) {
1063                 WORKLIST_REMOVE(&two->fd_list);
1064                 free_freedep(two);
1065         }
1066         return (one);
1067 }
1068
1069 /*
1070  * Move journal work from one list to another.  Duplicate freedeps and
1071  * jsegdeps are coalesced to keep the lists as small as possible.
1072  */
1073 static void
1074 jwork_move(struct workhead *dst, struct workhead *src)
1075 {
1076         struct freedep *freedep;
1077         struct jsegdep *jsegdep;
1078         struct worklist *wkn;
1079         struct worklist *wk;
1080
1081         KASSERT(dst != src,
1082             ("jwork_move: dst == src"));
1083         freedep = NULL;
1084         jsegdep = NULL;
1085         LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1086                 if (wk->wk_type == D_JSEGDEP)
1087                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1088                 else if (wk->wk_type == D_FREEDEP)
1089                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1090         }
1091
1092         while ((wk = LIST_FIRST(src)) != NULL) {
1093                 WORKLIST_REMOVE(wk);
1094                 WORKLIST_INSERT(dst, wk);
1095                 if (wk->wk_type == D_JSEGDEP) {
1096                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1097                         continue;
1098                 }
1099                 if (wk->wk_type == D_FREEDEP)
1100                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1101         }
1102 }
1103
1104 static void
1105 jwork_insert(struct workhead *dst, struct jsegdep *jsegdep)
1106 {
1107         struct jsegdep *jsegdepn;
1108         struct worklist *wk;
1109
1110         LIST_FOREACH(wk, dst, wk_list)
1111                 if (wk->wk_type == D_JSEGDEP)
1112                         break;
1113         if (wk == NULL) {
1114                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1115                 return;
1116         }
1117         jsegdepn = WK_JSEGDEP(wk);
1118         if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1119                 WORKLIST_REMOVE(wk);
1120                 free_jsegdep(jsegdepn);
1121                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1122         } else
1123                 free_jsegdep(jsegdep);
1124 }
1125
/*
 * Routines for tracking and managing workitems.
 */
static void workitem_free(struct worklist *, int);
static void workitem_alloc(struct worklist *, int, struct mount *);
static void workitem_reassign(struct worklist *, int);

/* Wrappers that cast the item pointer to struct worklist *. */
#define WORKITEM_FREE(item, type)                               \
        workitem_free((struct worklist *)(item), (type))
#define WORKITEM_REASSIGN(item, type)                           \
        workitem_reassign((struct worklist *)(item), (type))
1137
1138 static void
1139 workitem_free(struct worklist *item, int type)
1140 {
1141         struct ufsmount *ump;
1142
1143 #ifdef INVARIANTS
1144         if (item->wk_state & ONWORKLIST)
1145                 panic("workitem_free: %s(0x%X) still on list, "
1146                     "added in function %s at line %d",
1147                     TYPENAME(item->wk_type), item->wk_state,
1148                     item->wk_func, item->wk_line);
1149         if (item->wk_type != type && type != D_NEWBLK)
1150                 panic("workitem_free: type mismatch %s != %s",
1151                     TYPENAME(item->wk_type), TYPENAME(type));
1152 #endif
1153         if (item->wk_state & IOWAITING)
1154                 wakeup(item);
1155         ump = VFSTOUFS(item->wk_mp);
1156         LOCK_OWNED(ump);
1157         KASSERT(ump->softdep_deps > 0,
1158             ("workitem_free: %s: softdep_deps going negative",
1159             ump->um_fs->fs_fsmnt));
1160         if (--ump->softdep_deps == 0 && ump->softdep_req)
1161                 wakeup(&ump->softdep_deps);
1162         KASSERT(dep_current[item->wk_type] > 0,
1163             ("workitem_free: %s: dep_current[%s] going negative",
1164             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1165         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1166             ("workitem_free: %s: softdep_curdeps[%s] going negative",
1167             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1168         atomic_subtract_long(&dep_current[item->wk_type], 1);
1169         ump->softdep_curdeps[item->wk_type] -= 1;
1170         LIST_REMOVE(item, wk_all);
1171         free(item, DtoM(type));
1172 }
1173
1174 static void
1175 workitem_alloc(struct worklist *item,
1176         int type,
1177         struct mount *mp)
1178 {
1179         struct ufsmount *ump;
1180
1181         item->wk_type = type;
1182         item->wk_mp = mp;
1183         item->wk_state = 0;
1184
1185         ump = VFSTOUFS(mp);
1186         ACQUIRE_GBLLOCK(&lk);
1187         dep_current[type]++;
1188         if (dep_current[type] > dep_highuse[type])
1189                 dep_highuse[type] = dep_current[type];
1190         dep_total[type]++;
1191         FREE_GBLLOCK(&lk);
1192         ACQUIRE_LOCK(ump);
1193         ump->softdep_curdeps[type] += 1;
1194         ump->softdep_deps++;
1195         ump->softdep_accdeps++;
1196         LIST_INSERT_HEAD(&ump->softdep_alldeps[type], item, wk_all);
1197         FREE_LOCK(ump);
1198 }
1199
1200 static void
1201 workitem_reassign(struct worklist *item, int newtype)
1202 {
1203         struct ufsmount *ump;
1204
1205         ump = VFSTOUFS(item->wk_mp);
1206         LOCK_OWNED(ump);
1207         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1208             ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1209             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1210         ump->softdep_curdeps[item->wk_type] -= 1;
1211         ump->softdep_curdeps[newtype] += 1;
1212         KASSERT(dep_current[item->wk_type] > 0,
1213             ("workitem_reassign: %s: dep_current[%s] going negative",
1214             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1215         ACQUIRE_GBLLOCK(&lk);
1216         dep_current[newtype]++;
1217         dep_current[item->wk_type]--;
1218         if (dep_current[newtype] > dep_highuse[newtype])
1219                 dep_highuse[newtype] = dep_current[newtype];
1220         dep_total[newtype]++;
1221         FREE_GBLLOCK(&lk);
1222         item->wk_type = newtype;
1223         LIST_REMOVE(item, wk_all);
1224         LIST_INSERT_HEAD(&ump->softdep_alldeps[newtype], item, wk_all);
1225 }
1226
1227 /*
1228  * Workitem queue management
1229  */
1230 static int max_softdeps;        /* maximum number of structs before slowdown */
1231 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
1232 static int proc_waiting;        /* tracks whether we have a timeout posted */
1233 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
1234 static struct callout softdep_callout;
1235 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
1236 static int req_clear_remove;    /* syncer process flush some freeblks */
1237 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1238
/*
 * Runtime statistics.
 */
static int stat_flush_threads;          /* softdep flushing threads */
static int stat_worklist_push;          /* worklist cleanups */
static int stat_delayed_inact;          /* delayed inactivation cleanups */
static int stat_blk_limit_push;         /* times block limit neared */
static int stat_ino_limit_push;         /* times inode limit neared */
static int stat_blk_limit_hit;          /* times block slowdown imposed */
static int stat_ino_limit_hit;          /* times inode slowdown imposed */
static int stat_sync_limit_hit;         /* synchronous slowdowns imposed */

/* Buffers redirtied, by the reason they could not be written. */
static int stat_indir_blk_ptrs;         /* indirect pointers not written */
static int stat_inode_bitmap;           /* inode bitmap not written */
static int stat_direct_blk_ptrs;        /* direct pointers not written */
static int stat_dir_entry;              /* dir entry cannot write */
static int stat_jaddref;                /* ino bitmap can not write */
static int stat_jnewblk;                /* blk bitmap can not write */

/* Journal thresholds and waits. */
static int stat_journal_min;            /* times hit journal min threshold */
static int stat_journal_low;            /* times hit journal low threshold */
static int stat_journal_wait;           /* times blocked in jwait() */
static int stat_jwait_filepage;         /* ... in jwait() for filepage */
static int stat_jwait_freeblks;         /* ... in jwait() for freeblks */
static int stat_jwait_inode;            /* ... in jwait() for inodes */
static int stat_jwait_newblk;           /* ... in jwait() for newblks */

/* Cleanup requests. */
static int stat_cleanup_high_delay;     /* maximum cleanup delay (in ticks) */
static int stat_cleanup_blkrequests;    /* block cleanup requests */
static int stat_cleanup_inorequests;    /* inode cleanup requests */
static int stat_cleanup_retries;        /* cleanups that needed to flush */
static int stat_cleanup_failures;       /* cleanup requests that failed */
static int stat_emptyjblocks;           /* potentially empty journal blocks */
1269
1270 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1271     &max_softdeps, 0, "");
1272 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1273     &tickdelay, 0, "");
1274 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1275     &stat_flush_threads, 0, "");
1276 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push,
1277     CTLFLAG_RW | CTLFLAG_STATS, &stat_worklist_push, 0,"");
1278 SYSCTL_INT(_debug_softdep, OID_AUTO, delayed_inactivations, CTLFLAG_RD,
1279     &stat_delayed_inact, 0, "");
1280 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push,
1281     CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_push, 0,"");
1282 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push,
1283     CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_push, 0,"");
1284 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit,
1285     CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_hit, 0, "");
1286 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit,
1287     CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_hit, 0, "");
1288 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit,
1289     CTLFLAG_RW | CTLFLAG_STATS, &stat_sync_limit_hit, 0, "");
1290 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs,
1291     CTLFLAG_RW | CTLFLAG_STATS, &stat_indir_blk_ptrs, 0, "");
1292 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap,
1293     CTLFLAG_RW | CTLFLAG_STATS, &stat_inode_bitmap, 0, "");
1294 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs,
1295     CTLFLAG_RW | CTLFLAG_STATS, &stat_direct_blk_ptrs, 0, "");
1296 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry,
1297     CTLFLAG_RW | CTLFLAG_STATS, &stat_dir_entry, 0, "");
1298 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback,
1299     CTLFLAG_RW | CTLFLAG_STATS, &stat_jaddref, 0, "");
1300 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback,
1301     CTLFLAG_RW | CTLFLAG_STATS, &stat_jnewblk, 0, "");
1302 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low,
1303     CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_low, 0, "");
1304 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min,
1305     CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_min, 0, "");
1306 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait,
1307     CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_wait, 0, "");
1308 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage,
1309     CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_filepage, 0, "");
1310 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks,
1311     CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_freeblks, 0, "");
1312 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode,
1313     CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_inode, 0, "");
1314 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk,
1315     CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_newblk, 0, "");
1316 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests,
1317     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_blkrequests, 0, "");
1318 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests,
1319     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_inorequests, 0, "");
1320 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay,
1321     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_high_delay, 0, "");
1322 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries,
1323     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_retries, 0, "");
1324 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures,
1325     CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_failures, 0, "");
1326
1327 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1328     &softdep_flushcache, 0, "");
1329 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1330     &stat_emptyjblocks, 0, "");
1331
1332 SYSCTL_DECL(_vfs_ffs);
1333
1334 /* Whether to recompute the summary at mount time */
1335 static int compute_summary_at_mount = 0;
1336 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1337            &compute_summary_at_mount, 0, "Recompute summary at mount");
1338 static int print_threads = 0;
1339 SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1340     &print_threads, 0, "Notify flusher thread start/stop");
1341
1342 /* List of all filesystems mounted with soft updates */
1343 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1344
1345 static void
1346 get_parent_vp_unlock_bp(struct mount *mp,
1347         struct buf *bp,
1348         struct diraddhd *diraddhdp,
1349         struct diraddhd *unfinishedp)
1350 {
1351         struct diradd *dap;
1352
1353         /*
1354          * Requeue unfinished dependencies before
1355          * unlocking buffer, which could make
1356          * diraddhdp invalid.
1357          */
1358         ACQUIRE_LOCK(VFSTOUFS(mp));
1359         while ((dap = LIST_FIRST(unfinishedp)) != NULL) {
1360                 LIST_REMOVE(dap, da_pdlist);
1361                 LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
1362         }
1363         FREE_LOCK(VFSTOUFS(mp));
1364
1365         bp->b_vflags &= ~BV_SCANNED;
1366         BUF_NOREC(bp);
1367         BUF_UNLOCK(bp);
1368 }
1369
1370 /*
1371  * This function fetches inode inum on mount point mp.  We already
1372  * hold a locked vnode vp, and might have a locked buffer bp belonging
1373  * to vp.
1374
1375  * We must not block on acquiring the new inode lock as we will get
1376  * into a lock-order reversal with the buffer lock and possibly get a
1377  * deadlock.  Thus if we cannot instantiate the requested vnode
1378  * without sleeping on its lock, we must unlock the vnode and the
1379  * buffer before doing a blocking on the vnode lock.  We return
1380  * ERELOOKUP if we have had to unlock either the vnode or the buffer so
1381  * that the caller can reassess its state.
1382  *
1383  * Top-level VFS code (for syscalls and other consumers, e.g. callers
1384  * of VOP_FSYNC() in syncer) check for ERELOOKUP and restart at safe
1385  * point.
1386  *
1387  * Since callers expect to operate on fully constructed vnode, we also
1388  * recheck v_data after relock, and return ENOENT if NULL.
1389  *
1390  * If unlocking bp, we must unroll dequeueing its unfinished
1391  * dependencies, and clear scan flag, before unlocking.  If unlocking
1392  * vp while it is under deactivation, we re-queue deactivation.
1393  */
1394 static int
1395 get_parent_vp(struct vnode *vp,
1396         struct mount *mp,
1397         ino_t inum,
1398         struct buf *bp,
1399         struct diraddhd *diraddhdp,
1400         struct diraddhd *unfinishedp,
1401         struct vnode **rvp)
1402 {
1403         struct vnode *pvp;
1404         int error;
1405         bool bplocked;
1406
1407         ASSERT_VOP_ELOCKED(vp, "child vnode must be locked");
1408         for (bplocked = true, pvp = NULL;;) {
1409                 error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp,
1410                     FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP);
1411                 if (error == 0) {
1412                         /*
1413                          * Since we could have unlocked vp, the inode
1414                          * number could no longer indicate a
1415                          * constructed node.  In this case, we must
1416                          * restart the syscall.
1417                          */
1418                         if (VTOI(pvp)->i_mode == 0 || !bplocked) {
1419                                 if (bp != NULL && bplocked)
1420                                         get_parent_vp_unlock_bp(mp, bp,
1421                                             diraddhdp, unfinishedp);
1422                                 if (VTOI(pvp)->i_mode == 0)
1423                                         vgone(pvp);
1424                                 error = ERELOOKUP;
1425                                 goto out2;
1426                         }
1427                         goto out1;
1428                 }
1429                 if (bp != NULL && bplocked) {
1430                         get_parent_vp_unlock_bp(mp, bp, diraddhdp, unfinishedp);
1431                         bplocked = false;
1432                 }
1433
1434                 /*
1435                  * Do not drop vnode lock while inactivating during
1436                  * vunref.  This would result in leaks of the VI flags
1437                  * and reclaiming of non-truncated vnode.  Instead,
1438                  * re-schedule inactivation hoping that we would be
1439                  * able to sync inode later.
1440                  */
1441                 if ((vp->v_iflag & VI_DOINGINACT) != 0 &&
1442                     (vp->v_vflag & VV_UNREF) != 0) {
1443                         VI_LOCK(vp);
1444                         vp->v_iflag |= VI_OWEINACT;
1445                         VI_UNLOCK(vp);
1446                         return (ERELOOKUP);
1447                 }
1448
1449                 VOP_UNLOCK(vp);
1450                 error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp,
1451                     FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP);
1452                 if (error != 0) {
1453                         MPASS(error != ERELOOKUP);
1454                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1455                         break;
1456                 }
1457                 if (VTOI(pvp)->i_mode == 0) {
1458                         vgone(pvp);
1459                         vput(pvp);
1460                         pvp = NULL;
1461                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1462                         error = ERELOOKUP;
1463                         break;
1464                 }
1465                 error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
1466                 if (error == 0)
1467                         break;
1468                 vput(pvp);
1469                 pvp = NULL;
1470                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1471                 if (vp->v_data == NULL) {
1472                         error = ENOENT;
1473                         break;
1474                 }
1475         }
1476         if (bp != NULL) {
1477                 MPASS(!bplocked);
1478                 error = ERELOOKUP;
1479         }
1480 out2:
1481         if (error != 0 && pvp != NULL) {
1482                 vput(pvp);
1483                 pvp = NULL;
1484         }
1485 out1:
1486         *rvp = pvp;
1487         ASSERT_VOP_ELOCKED(vp, "child vnode must be locked on return");
1488         return (error);
1489 }
1490
1491 /*
1492  * This function cleans the worklist for a filesystem.
1493  * Each filesystem running with soft dependencies gets its own
1494  * thread to run in this function. The thread is started up in
1495  * softdep_mount and shutdown in softdep_unmount. They show up
1496  * as part of the kernel "bufdaemon" process whose process
1497  * entry is available in bufdaemonproc.
1498  */
1499 static int searchfailed;
1500 extern struct proc *bufdaemonproc;
1501 static void
1502 softdep_flush(void *addr)
1503 {
1504         struct mount *mp;
1505         struct thread *td;
1506         struct ufsmount *ump;
1507         int cleanups;
1508
1509         td = curthread;
1510         td->td_pflags |= TDP_NORUNNINGBUF;
1511         mp = (struct mount *)addr;
1512         ump = VFSTOUFS(mp);
1513         atomic_add_int(&stat_flush_threads, 1);
1514         ACQUIRE_LOCK(ump);
1515         ump->softdep_flags &= ~FLUSH_STARTING;
1516         wakeup(&ump->softdep_flushtd);
1517         FREE_LOCK(ump);
1518         if (print_threads) {
1519                 if (stat_flush_threads == 1)
1520                         printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1521                             bufdaemonproc->p_pid);
1522                 printf("Start thread %s\n", td->td_name);
1523         }
1524         for (;;) {
1525                 while (softdep_process_worklist(mp, 0) > 0 ||
1526                     (MOUNTEDSUJ(mp) &&
1527                     VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1528                         kthread_suspend_check();
1529                 ACQUIRE_LOCK(ump);
1530                 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1531                         msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1532                             "sdflush", hz / 2);
1533                 ump->softdep_flags &= ~FLUSH_CLEANUP;
1534                 /*
1535                  * Check to see if we are done and need to exit.
1536                  */
1537                 if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1538                         FREE_LOCK(ump);
1539                         continue;
1540                 }
1541                 ump->softdep_flags &= ~FLUSH_EXIT;
1542                 cleanups = ump->um_softdep->sd_cleanups;
1543                 FREE_LOCK(ump);
1544                 wakeup(&ump->softdep_flags);
1545                 if (print_threads) {
1546                         printf("Stop thread %s: searchfailed %d, "
1547                             "did cleanups %d\n",
1548                             td->td_name, searchfailed, cleanups);
1549                 }
1550                 atomic_subtract_int(&stat_flush_threads, 1);
1551                 kthread_exit();
1552                 panic("kthread_exit failed\n");
1553         }
1554 }
1555
1556 static void
1557 worklist_speedup(struct mount *mp)
1558 {
1559         struct ufsmount *ump;
1560
1561         ump = VFSTOUFS(mp);
1562         LOCK_OWNED(ump);
1563         if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1564                 ump->softdep_flags |= FLUSH_CLEANUP;
1565         wakeup(&ump->softdep_flushtd);
1566 }
1567
1568 static void
1569 softdep_send_speedup(struct ufsmount *ump,
1570         off_t shortage,
1571         uint64_t flags)
1572 {
1573         struct buf *bp;
1574
1575         if ((ump->um_flags & UM_CANSPEEDUP) == 0)
1576                 return;
1577
1578         bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
1579         bp->b_iocmd = BIO_SPEEDUP;
1580         bp->b_ioflags = flags;
1581         bp->b_bcount = omin(shortage, LONG_MAX);
1582         g_vfs_strategy(ump->um_bo, bp);
1583         bufwait(bp);
1584         free(bp, M_TRIM);
1585 }
1586
1587 static int
1588 softdep_speedup(struct ufsmount *ump)
1589 {
1590         struct ufsmount *altump;
1591         struct mount_softdeps *sdp;
1592
1593         LOCK_OWNED(ump);
1594         worklist_speedup(ump->um_mountp);
1595         bd_speedup();
1596         /*
1597          * If we have global shortages, then we need other
1598          * filesystems to help with the cleanup. Here we wakeup a
1599          * flusher thread for a filesystem that is over its fair
1600          * share of resources.
1601          */
1602         if (req_clear_inodedeps || req_clear_remove) {
1603                 ACQUIRE_GBLLOCK(&lk);
1604                 TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1605                         if ((altump = sdp->sd_ump) == ump)
1606                                 continue;
1607                         if (((req_clear_inodedeps &&
1608                             altump->softdep_curdeps[D_INODEDEP] >
1609                             max_softdeps / stat_flush_threads) ||
1610                             (req_clear_remove &&
1611                             altump->softdep_curdeps[D_DIRREM] >
1612                             (max_softdeps / 2) / stat_flush_threads)) &&
1613                             TRY_ACQUIRE_LOCK(altump))
1614                                 break;
1615                 }
1616                 if (sdp == NULL) {
1617                         searchfailed++;
1618                         FREE_GBLLOCK(&lk);
1619                 } else {
1620                         /*
1621                          * Move to the end of the list so we pick a
1622                          * different one on out next try.
1623                          */
1624                         TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1625                         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1626                         FREE_GBLLOCK(&lk);
1627                         if ((altump->softdep_flags &
1628                             (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1629                                 altump->softdep_flags |= FLUSH_CLEANUP;
1630                         altump->um_softdep->sd_cleanups++;
1631                         wakeup(&altump->softdep_flushtd);
1632                         FREE_LOCK(altump);
1633                 }
1634         }
1635         return (speedup_syncer());
1636 }
1637
1638 /*
1639  * Add an item to the end of the work queue.
1640  * This routine requires that the lock be held.
1641  * This is the only routine that adds items to the list.
1642  * The following routine is the only one that removes items
1643  * and does so in order from first to last.
1644  */
1645
1646 #define WK_HEAD         0x0001  /* Add to HEAD. */
1647 #define WK_NODELAY      0x0002  /* Process immediately. */
1648
1649 static void
1650 add_to_worklist(struct worklist *wk, int flags)
1651 {
1652         struct ufsmount *ump;
1653
1654         ump = VFSTOUFS(wk->wk_mp);
1655         LOCK_OWNED(ump);
1656         if (wk->wk_state & ONWORKLIST)
1657                 panic("add_to_worklist: %s(0x%X) already on list",
1658                     TYPENAME(wk->wk_type), wk->wk_state);
1659         wk->wk_state |= ONWORKLIST;
1660         if (ump->softdep_on_worklist == 0) {
1661                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1662                 ump->softdep_worklist_tail = wk;
1663         } else if (flags & WK_HEAD) {
1664                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1665         } else {
1666                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1667                 ump->softdep_worklist_tail = wk;
1668         }
1669         ump->softdep_on_worklist += 1;
1670         if (flags & WK_NODELAY)
1671                 worklist_speedup(wk->wk_mp);
1672 }
1673
1674 /*
1675  * Remove the item to be processed. If we are removing the last
1676  * item on the list, we need to recalculate the tail pointer.
1677  */
1678 static void
1679 remove_from_worklist(struct worklist *wk)
1680 {
1681         struct ufsmount *ump;
1682
1683         ump = VFSTOUFS(wk->wk_mp);
1684         if (ump->softdep_worklist_tail == wk)
1685                 ump->softdep_worklist_tail =
1686                     (struct worklist *)wk->wk_list.le_prev;
1687         WORKLIST_REMOVE(wk);
1688         ump->softdep_on_worklist -= 1;
1689 }
1690
1691 static void
1692 wake_worklist(struct worklist *wk)
1693 {
1694         if (wk->wk_state & IOWAITING) {
1695                 wk->wk_state &= ~IOWAITING;
1696                 wakeup(wk);
1697         }
1698 }
1699
1700 static void
1701 wait_worklist(struct worklist *wk, char *wmesg)
1702 {
1703         struct ufsmount *ump;
1704
1705         ump = VFSTOUFS(wk->wk_mp);
1706         wk->wk_state |= IOWAITING;
1707         msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1708 }
1709
1710 /*
1711  * Process that runs once per second to handle items in the background queue.
1712  *
1713  * Note that we ensure that everything is done in the order in which they
1714  * appear in the queue. The code below depends on this property to ensure
1715  * that blocks of a file are freed before the inode itself is freed. This
1716  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1717  * until all the old ones have been purged from the dependency lists.
1718  */
1719 static int
1720 softdep_process_worklist(struct mount *mp, int full)
1721 {
1722         int cnt, matchcnt;
1723         struct ufsmount *ump;
1724         long starttime;
1725
1726         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1727         ump = VFSTOUFS(mp);
1728         if (ump->um_softdep == NULL)
1729                 return (0);
1730         matchcnt = 0;
1731         ACQUIRE_LOCK(ump);
1732         starttime = time_second;
1733         softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1734         check_clear_deps(mp);
1735         while (ump->softdep_on_worklist > 0) {
1736                 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1737                         break;
1738                 else
1739                         matchcnt += cnt;
1740                 check_clear_deps(mp);
1741                 /*
1742                  * We do not generally want to stop for buffer space, but if
1743                  * we are really being a buffer hog, we will stop and wait.
1744                  */
1745                 if (should_yield()) {
1746                         FREE_LOCK(ump);
1747                         kern_yield(PRI_USER);
1748                         bwillwrite();
1749                         ACQUIRE_LOCK(ump);
1750                 }
1751                 /*
1752                  * Never allow processing to run for more than one
1753                  * second. This gives the syncer thread the opportunity
1754                  * to pause if appropriate.
1755                  */
1756                 if (!full && starttime != time_second)
1757                         break;
1758         }
1759         if (full == 0)
1760                 journal_unsuspend(ump);
1761         FREE_LOCK(ump);
1762         return (matchcnt);
1763 }
1764
1765 /*
1766  * Process all removes associated with a vnode if we are running out of
1767  * journal space.  Any other process which attempts to flush these will
1768  * be unable as we have the vnodes locked.
1769  */
1770 static void
1771 process_removes(struct vnode *vp)
1772 {
1773         struct inodedep *inodedep;
1774         struct dirrem *dirrem;
1775         struct ufsmount *ump;
1776         struct mount *mp;
1777         ino_t inum;
1778
1779         mp = vp->v_mount;
1780         ump = VFSTOUFS(mp);
1781         LOCK_OWNED(ump);
1782         inum = VTOI(vp)->i_number;
1783         for (;;) {
1784 top:
1785                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1786                         return;
1787                 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1788                         /*
1789                          * If another thread is trying to lock this vnode
1790                          * it will fail but we must wait for it to do so
1791                          * before we can proceed.
1792                          */
1793                         if (dirrem->dm_state & INPROGRESS) {
1794                                 wait_worklist(&dirrem->dm_list, "pwrwait");
1795                                 goto top;
1796                         }
1797                         if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1798                             (COMPLETE | ONWORKLIST))
1799                                 break;
1800                 }
1801                 if (dirrem == NULL)
1802                         return;
1803                 remove_from_worklist(&dirrem->dm_list);
1804                 FREE_LOCK(ump);
1805                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1806                         panic("process_removes: suspended filesystem");
1807                 handle_workitem_remove(dirrem, 0);
1808                 vn_finished_secondary_write(mp);
1809                 ACQUIRE_LOCK(ump);
1810         }
1811 }
1812
1813 /*
1814  * Process all truncations associated with a vnode if we are running out
1815  * of journal space.  This is called when the vnode lock is already held
1816  * and no other process can clear the truncation.  This function returns
1817  * a value greater than zero if it did any work.
1818  */
1819 static void
1820 process_truncates(struct vnode *vp)
1821 {
1822         struct inodedep *inodedep;
1823         struct freeblks *freeblks;
1824         struct ufsmount *ump;
1825         struct mount *mp;
1826         ino_t inum;
1827         int cgwait;
1828
1829         mp = vp->v_mount;
1830         ump = VFSTOUFS(mp);
1831         LOCK_OWNED(ump);
1832         inum = VTOI(vp)->i_number;
1833         for (;;) {
1834                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1835                         return;
1836                 cgwait = 0;
1837                 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1838                         /* Journal entries not yet written.  */
1839                         if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1840                                 jwait(&LIST_FIRST(
1841                                     &freeblks->fb_jblkdephd)->jb_list,
1842                                     MNT_WAIT);
1843                                 break;
1844                         }
1845                         /* Another thread is executing this item. */
1846                         if (freeblks->fb_state & INPROGRESS) {
1847                                 wait_worklist(&freeblks->fb_list, "ptrwait");
1848                                 break;
1849                         }
1850                         /* Freeblks is waiting on a inode write. */
1851                         if ((freeblks->fb_state & COMPLETE) == 0) {
1852                                 FREE_LOCK(ump);
1853                                 ffs_update(vp, 1);
1854                                 ACQUIRE_LOCK(ump);
1855                                 break;
1856                         }
1857                         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1858                             (ALLCOMPLETE | ONWORKLIST)) {
1859                                 remove_from_worklist(&freeblks->fb_list);
1860                                 freeblks->fb_state |= INPROGRESS;
1861                                 FREE_LOCK(ump);
1862                                 if (vn_start_secondary_write(NULL, &mp,
1863                                     V_NOWAIT))
1864                                         panic("process_truncates: "
1865                                             "suspended filesystem");
1866                                 handle_workitem_freeblocks(freeblks, 0);
1867                                 vn_finished_secondary_write(mp);
1868                                 ACQUIRE_LOCK(ump);
1869                                 break;
1870                         }
1871                         if (freeblks->fb_cgwait)
1872                                 cgwait++;
1873                 }
1874                 if (cgwait) {
1875                         FREE_LOCK(ump);
1876                         sync_cgs(mp, MNT_WAIT);
1877                         ffs_sync_snap(mp, MNT_WAIT);
1878                         ACQUIRE_LOCK(ump);
1879                         continue;
1880                 }
1881                 if (freeblks == NULL)
1882                         break;
1883         }
1884         return;
1885 }
1886
1887 /*
1888  * Process one item on the worklist.
1889  */
1890 static int
1891 process_worklist_item(struct mount *mp,
1892         int target,
1893         int flags)
1894 {
1895         struct worklist sentinel;
1896         struct worklist *wk;
1897         struct ufsmount *ump;
1898         int matchcnt;
1899         int error;
1900
1901         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1902         /*
1903          * If we are being called because of a process doing a
1904          * copy-on-write, then it is not safe to write as we may
1905          * recurse into the copy-on-write routine.
1906          */
1907         if (curthread->td_pflags & TDP_COWINPROGRESS)
1908                 return (-1);
1909         PHOLD(curproc); /* Don't let the stack go away. */
1910         ump = VFSTOUFS(mp);
1911         LOCK_OWNED(ump);
1912         matchcnt = 0;
1913         sentinel.wk_mp = NULL;
1914         sentinel.wk_type = D_SENTINEL;
1915         LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1916         for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1917             wk = LIST_NEXT(&sentinel, wk_list)) {
1918                 if (wk->wk_type == D_SENTINEL) {
1919                         LIST_REMOVE(&sentinel, wk_list);
1920                         LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1921                         continue;
1922                 }
1923                 if (wk->wk_state & INPROGRESS)
1924                         panic("process_worklist_item: %p already in progress.",
1925                             wk);
1926                 wk->wk_state |= INPROGRESS;
1927                 remove_from_worklist(wk);
1928                 FREE_LOCK(ump);
1929                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1930                         panic("process_worklist_item: suspended filesystem");
1931                 switch (wk->wk_type) {
1932                 case D_DIRREM:
1933                         /* removal of a directory entry */
1934                         error = handle_workitem_remove(WK_DIRREM(wk), flags);
1935                         break;
1936
1937                 case D_FREEBLKS:
1938                         /* releasing blocks and/or fragments from a file */
1939                         error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1940                             flags);
1941                         break;
1942
1943                 case D_FREEFRAG:
1944                         /* releasing a fragment when replaced as a file grows */
1945                         handle_workitem_freefrag(WK_FREEFRAG(wk));
1946                         error = 0;
1947                         break;
1948
1949                 case D_FREEFILE:
1950                         /* releasing an inode when its link count drops to 0 */
1951                         handle_workitem_freefile(WK_FREEFILE(wk));
1952                         error = 0;
1953                         break;
1954
1955                 default:
1956                         panic("%s_process_worklist: Unknown type %s",
1957                             "softdep", TYPENAME(wk->wk_type));
1958                         /* NOTREACHED */
1959                 }
1960                 vn_finished_secondary_write(mp);
1961                 ACQUIRE_LOCK(ump);
1962                 if (error == 0) {
1963                         if (++matchcnt == target)
1964                                 break;
1965                         continue;
1966                 }
1967                 /*
1968                  * We have to retry the worklist item later.  Wake up any
1969                  * waiters who may be able to complete it immediately and
1970                  * add the item back to the head so we don't try to execute
1971                  * it again.
1972                  */
1973                 wk->wk_state &= ~INPROGRESS;
1974                 wake_worklist(wk);
1975                 add_to_worklist(wk, WK_HEAD);
1976         }
1977         /* Sentinal could've become the tail from remove_from_worklist. */
1978         if (ump->softdep_worklist_tail == &sentinel)
1979                 ump->softdep_worklist_tail =
1980                     (struct worklist *)sentinel.wk_list.le_prev;
1981         LIST_REMOVE(&sentinel, wk_list);
1982         PRELE(curproc);
1983         return (matchcnt);
1984 }
1985
1986 /*
1987  * Move dependencies from one buffer to another.
1988  */
1989 int
1990 softdep_move_dependencies(struct buf *oldbp, struct buf *newbp)
1991 {
1992         struct worklist *wk, *wktail;
1993         struct ufsmount *ump;
1994         int dirty;
1995
1996         if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1997                 return (0);
1998         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1999             ("softdep_move_dependencies called on non-softdep filesystem"));
2000         dirty = 0;
2001         wktail = NULL;
2002         ump = VFSTOUFS(wk->wk_mp);
2003         ACQUIRE_LOCK(ump);
2004         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
2005                 LIST_REMOVE(wk, wk_list);
2006                 if (wk->wk_type == D_BMSAFEMAP &&
2007                     bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
2008                         dirty = 1;
2009                 if (wktail == NULL)
2010                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
2011                 else
2012                         LIST_INSERT_AFTER(wktail, wk, wk_list);
2013                 wktail = wk;
2014         }
2015         FREE_LOCK(ump);
2016
2017         return (dirty);
2018 }
2019
2020 /*
2021  * Purge the work list of all items associated with a particular mount point.
2022  */
2023 int
2024 softdep_flushworklist(struct mount *oldmnt,
2025         int *countp,
2026         struct thread *td)
2027 {
2028         struct vnode *devvp;
2029         struct ufsmount *ump;
2030         int count, error;
2031
2032         /*
2033          * Alternately flush the block device associated with the mount
2034          * point and process any dependencies that the flushing
2035          * creates. We continue until no more worklist dependencies
2036          * are found.
2037          */
2038         *countp = 0;
2039         error = 0;
2040         ump = VFSTOUFS(oldmnt);
2041         devvp = ump->um_devvp;
2042         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
2043                 *countp += count;
2044                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2045                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
2046                 VOP_UNLOCK(devvp);
2047                 if (error != 0)
2048                         break;
2049         }
2050         return (error);
2051 }
2052
2053 #define SU_WAITIDLE_RETRIES     20
2054 static int
2055 softdep_waitidle(struct mount *mp, int flags __unused)
2056 {
2057         struct ufsmount *ump;
2058         struct vnode *devvp;
2059         struct thread *td;
2060         int error, i;
2061
2062         ump = VFSTOUFS(mp);
2063         KASSERT(ump->um_softdep != NULL,
2064             ("softdep_waitidle called on non-softdep filesystem"));
2065         devvp = ump->um_devvp;
2066         td = curthread;
2067         error = 0;
2068         ACQUIRE_LOCK(ump);
2069         for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
2070                 ump->softdep_req = 1;
2071                 KASSERT((flags & FORCECLOSE) == 0 ||
2072                     ump->softdep_on_worklist == 0,
2073                     ("softdep_waitidle: work added after flush"));
2074                 msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
2075                     "softdeps", 10 * hz);
2076                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2077                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
2078                 VOP_UNLOCK(devvp);
2079                 ACQUIRE_LOCK(ump);
2080                 if (error != 0)
2081                         break;
2082         }
2083         ump->softdep_req = 0;
2084         if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
2085                 error = EBUSY;
2086                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
2087                     mp);
2088         }
2089         FREE_LOCK(ump);
2090         return (error);
2091 }
2092
2093 /*
2094  * Flush all vnodes and worklist items associated with a specified mount point.
2095  */
2096 int
2097 softdep_flushfiles(struct mount *oldmnt,
2098         int flags,
2099         struct thread *td)
2100 {
2101         struct ufsmount *ump;
2102 #ifdef QUOTA
2103         int i;
2104 #endif
2105         int error, early, depcount, loopcnt, retry_flush_count, retry;
2106         int morework;
2107
2108         ump = VFSTOUFS(oldmnt);
2109         KASSERT(ump->um_softdep != NULL,
2110             ("softdep_flushfiles called on non-softdep filesystem"));
2111         loopcnt = 10;
2112         retry_flush_count = 3;
2113 retry_flush:
2114         error = 0;
2115
2116         /*
2117          * Alternately flush the vnodes associated with the mount
2118          * point and process any dependencies that the flushing
2119          * creates. In theory, this loop can happen at most twice,
2120          * but we give it a few extra just to be sure.
2121          */
2122         for (; loopcnt > 0; loopcnt--) {
2123                 /*
2124                  * Do another flush in case any vnodes were brought in
2125                  * as part of the cleanup operations.
2126                  */
2127                 early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
2128                     MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
2129                 if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
2130                         break;
2131                 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
2132                     depcount == 0)
2133                         break;
2134         }
2135         /*
2136          * If we are unmounting then it is an error to fail. If we
2137          * are simply trying to downgrade to read-only, then filesystem
2138          * activity can keep us busy forever, so we just fail with EBUSY.
2139          */
2140         if (loopcnt == 0) {
2141                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2142                         panic("softdep_flushfiles: looping");
2143                 error = EBUSY;
2144         }
2145         if (!error)
2146                 error = softdep_waitidle(oldmnt, flags);
2147         if (!error) {
2148                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2149                         retry = 0;
2150                         MNT_ILOCK(oldmnt);
2151                         morework = oldmnt->mnt_nvnodelistsize > 0;
2152 #ifdef QUOTA
2153                         UFS_LOCK(ump);
2154                         for (i = 0; i < MAXQUOTAS; i++) {
2155                                 if (ump->um_quotas[i] != NULLVP)
2156                                         morework = 1;
2157                         }
2158                         UFS_UNLOCK(ump);
2159 #endif
2160                         if (morework) {
2161                                 if (--retry_flush_count > 0) {
2162                                         retry = 1;
2163                                         loopcnt = 3;
2164                                 } else
2165                                         error = EBUSY;
2166                         }
2167                         MNT_IUNLOCK(oldmnt);
2168                         if (retry)
2169                                 goto retry_flush;
2170                 }
2171         }
2172         return (error);
2173 }
2174
2175 /*
2176  * Structure hashing.
2177  *
2178  * There are four types of structures that can be looked up:
2179  *      1) pagedep structures identified by mount point, inode number,
2180  *         and logical block.
2181  *      2) inodedep structures identified by mount point and inode number.
2182  *      3) newblk structures identified by mount point and
2183  *         physical block number.
2184  *      4) bmsafemap structures identified by mount point and
2185  *         cylinder group number.
2186  *
2187  * The "pagedep" and "inodedep" dependency structures are hashed
2188  * separately from the file blocks and inodes to which they correspond.
2189  * This separation helps when the in-memory copy of an inode or
2190  * file block must be replaced. It also obviates the need to access
2191  * an inode or file page when simply updating (or de-allocating)
2192  * dependency structures. Lookup of newblk structures is needed to
2193  * find newly allocated blocks when trying to associate them with
2194  * their allocdirect or allocindir structure.
2195  *
2196  * The lookup routines optionally create and hash a new instance when
2197  * an existing entry is not found. The bmsafemap lookup routine always
2198  * allocates a new structure if an existing one is not found.
2199  */
2200 #define DEPALLOC        0x0001  /* allocate structure if lookup fails */
2201
2202 /*
2203  * Structures and routines associated with pagedep caching.
2204  */
2205 #define PAGEDEP_HASH(ump, inum, lbn) \
2206         (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2207
2208 static int
2209 pagedep_find(struct pagedep_hashhead *pagedephd,
2210         ino_t ino,
2211         ufs_lbn_t lbn,
2212         struct pagedep **pagedeppp)
2213 {
2214         struct pagedep *pagedep;
2215
2216         LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2217                 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2218                         *pagedeppp = pagedep;
2219                         return (1);
2220                 }
2221         }
2222         *pagedeppp = NULL;
2223         return (0);
2224 }
2225 /*
2226  * Look up a pagedep. Return 1 if found, 0 otherwise.
2227  * If not found, allocate if DEPALLOC flag is passed.
2228  * Found or allocated entry is returned in pagedeppp.
2229  */
2230 static int
2231 pagedep_lookup(struct mount *mp,
2232         struct buf *bp,
2233         ino_t ino,
2234         ufs_lbn_t lbn,
2235         int flags,
2236         struct pagedep **pagedeppp)
2237 {
2238         struct pagedep *pagedep;
2239         struct pagedep_hashhead *pagedephd;
2240         struct worklist *wk;
2241         struct ufsmount *ump;
2242         int ret;
2243         int i;
2244
2245         ump = VFSTOUFS(mp);
2246         LOCK_OWNED(ump);
2247         if (bp) {
2248                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2249                         if (wk->wk_type == D_PAGEDEP) {
2250                                 *pagedeppp = WK_PAGEDEP(wk);
2251                                 return (1);
2252                         }
2253                 }
2254         }
2255         pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2256         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2257         if (ret) {
2258                 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2259                         WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2260                 return (1);
2261         }
2262         if ((flags & DEPALLOC) == 0)
2263                 return (0);
2264         FREE_LOCK(ump);
2265         pagedep = malloc(sizeof(struct pagedep),
2266             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2267         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2268         ACQUIRE_LOCK(ump);
2269         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2270         if (*pagedeppp) {
2271                 /*
2272                  * This should never happen since we only create pagedeps
2273                  * with the vnode lock held.  Could be an assert.
2274                  */
2275                 WORKITEM_FREE(pagedep, D_PAGEDEP);
2276                 return (ret);
2277         }
2278         pagedep->pd_ino = ino;
2279         pagedep->pd_lbn = lbn;
2280         LIST_INIT(&pagedep->pd_dirremhd);
2281         LIST_INIT(&pagedep->pd_pendinghd);
2282         for (i = 0; i < DAHASHSZ; i++)
2283                 LIST_INIT(&pagedep->pd_diraddhd[i]);
2284         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2285         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2286         *pagedeppp = pagedep;
2287         return (0);
2288 }
2289
2290 /*
2291  * Structures and routines associated with inodedep caching.
2292  */
2293 #define INODEDEP_HASH(ump, inum) \
2294       (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2295
2296 static int
2297 inodedep_find(struct inodedep_hashhead *inodedephd,
2298         ino_t inum,
2299         struct inodedep **inodedeppp)
2300 {
2301         struct inodedep *inodedep;
2302
2303         LIST_FOREACH(inodedep, inodedephd, id_hash)
2304                 if (inum == inodedep->id_ino)
2305                         break;
2306         if (inodedep) {
2307                 *inodedeppp = inodedep;
2308                 return (1);
2309         }
2310         *inodedeppp = NULL;
2311
2312         return (0);
2313 }
2314 /*
2315  * Look up an inodedep. Return 1 if found, 0 if not found.
2316  * If not found, allocate if DEPALLOC flag is passed.
2317  * Found or allocated entry is returned in inodedeppp.
2318  */
2319 static int
2320 inodedep_lookup(struct mount *mp,
2321         ino_t inum,
2322         int flags,
2323         struct inodedep **inodedeppp)
2324 {
2325         struct inodedep *inodedep;
2326         struct inodedep_hashhead *inodedephd;
2327         struct ufsmount *ump;
2328         struct fs *fs;
2329
2330         ump = VFSTOUFS(mp);
2331         LOCK_OWNED(ump);
2332         fs = ump->um_fs;
2333         inodedephd = INODEDEP_HASH(ump, inum);
2334
2335         if (inodedep_find(inodedephd, inum, inodedeppp))
2336                 return (1);
2337         if ((flags & DEPALLOC) == 0)
2338                 return (0);
2339         /*
2340          * If the system is over its limit and our filesystem is
2341          * responsible for more than our share of that usage and
2342          * we are not in a rush, request some inodedep cleanup.
2343          */
2344         if (softdep_excess_items(ump, D_INODEDEP))
2345                 schedule_cleanup(mp);
2346         else
2347                 FREE_LOCK(ump);
2348         inodedep = malloc(sizeof(struct inodedep),
2349                 M_INODEDEP, M_SOFTDEP_FLAGS);
2350         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2351         ACQUIRE_LOCK(ump);
2352         if (inodedep_find(inodedephd, inum, inodedeppp)) {
2353                 WORKITEM_FREE(inodedep, D_INODEDEP);
2354                 return (1);
2355         }
2356         inodedep->id_fs = fs;
2357         inodedep->id_ino = inum;
2358         inodedep->id_state = ALLCOMPLETE;
2359         inodedep->id_nlinkdelta = 0;
2360         inodedep->id_nlinkwrote = -1;
2361         inodedep->id_savedino1 = NULL;
2362         inodedep->id_savedsize = -1;
2363         inodedep->id_savedextsize = -1;
2364         inodedep->id_savednlink = -1;
2365         inodedep->id_bmsafemap = NULL;
2366         inodedep->id_mkdiradd = NULL;
2367         LIST_INIT(&inodedep->id_dirremhd);
2368         LIST_INIT(&inodedep->id_pendinghd);
2369         LIST_INIT(&inodedep->id_inowait);
2370         LIST_INIT(&inodedep->id_bufwait);
2371         TAILQ_INIT(&inodedep->id_inoreflst);
2372         TAILQ_INIT(&inodedep->id_inoupdt);
2373         TAILQ_INIT(&inodedep->id_newinoupdt);
2374         TAILQ_INIT(&inodedep->id_extupdt);
2375         TAILQ_INIT(&inodedep->id_newextupdt);
2376         TAILQ_INIT(&inodedep->id_freeblklst);
2377         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2378         *inodedeppp = inodedep;
2379         return (0);
2380 }
2381
2382 /*
2383  * Structures and routines associated with newblk caching.
2384  */
2385 #define NEWBLK_HASH(ump, inum) \
2386         (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2387
2388 static int
2389 newblk_find(struct newblk_hashhead *newblkhd,
2390         ufs2_daddr_t newblkno,
2391         int flags,
2392         struct newblk **newblkpp)
2393 {
2394         struct newblk *newblk;
2395
2396         LIST_FOREACH(newblk, newblkhd, nb_hash) {
2397                 if (newblkno != newblk->nb_newblkno)
2398                         continue;
2399                 /*
2400                  * If we're creating a new dependency don't match those that
2401                  * have already been converted to allocdirects.  This is for
2402                  * a frag extend.
2403                  */
2404                 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2405                         continue;
2406                 break;
2407         }
2408         if (newblk) {
2409                 *newblkpp = newblk;
2410                 return (1);
2411         }
2412         *newblkpp = NULL;
2413         return (0);
2414 }
2415
2416 /*
2417  * Look up a newblk. Return 1 if found, 0 if not found.
2418  * If not found, allocate if DEPALLOC flag is passed.
2419  * Found or allocated entry is returned in newblkpp.
2420  */
2421 static int
2422 newblk_lookup(struct mount *mp,
2423         ufs2_daddr_t newblkno,
2424         int flags,
2425         struct newblk **newblkpp)
2426 {
2427         struct newblk *newblk;
2428         struct newblk_hashhead *newblkhd;
2429         struct ufsmount *ump;
2430
2431         ump = VFSTOUFS(mp);
2432         LOCK_OWNED(ump);
2433         newblkhd = NEWBLK_HASH(ump, newblkno);
2434         if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2435                 return (1);
2436         if ((flags & DEPALLOC) == 0)
2437                 return (0);
2438         if (softdep_excess_items(ump, D_NEWBLK) ||
2439             softdep_excess_items(ump, D_ALLOCDIRECT) ||
2440             softdep_excess_items(ump, D_ALLOCINDIR))
2441                 schedule_cleanup(mp);
2442         else
2443                 FREE_LOCK(ump);
2444         newblk = malloc(sizeof(union allblk), M_NEWBLK,
2445             M_SOFTDEP_FLAGS | M_ZERO);
2446         workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2447         ACQUIRE_LOCK(ump);
2448         if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2449                 WORKITEM_FREE(newblk, D_NEWBLK);
2450                 return (1);
2451         }
2452         newblk->nb_freefrag = NULL;
2453         LIST_INIT(&newblk->nb_indirdeps);
2454         LIST_INIT(&newblk->nb_newdirblk);
2455         LIST_INIT(&newblk->nb_jwork);
2456         newblk->nb_state = ATTACHED;
2457         newblk->nb_newblkno = newblkno;
2458         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2459         *newblkpp = newblk;
2460         return (0);
2461 }
2462
2463 /*
2464  * Structures and routines associated with freed indirect block caching.
2465  */
/*
 * Pick the freed-indirect-block hash chain for a block number.  The
 * stored hash size is applied as a bit mask on the block number.
 */
#define INDIR_HASH(ump, bn) \
	(&(ump)->indir_hashtbl[(bn) & (ump)->indir_hash_size])
2468
2469 /*
2470  * Lookup an indirect block in the indir hash table.  The freework is
2471  * removed and potentially freed.  The caller must do a blocking journal
2472  * write before writing to the blkno.
2473  */
2474 static int
2475 indirblk_lookup(struct mount *mp, ufs2_daddr_t blkno)
2476 {
2477         struct freework *freework;
2478         struct indir_hashhead *wkhd;
2479         struct ufsmount *ump;
2480
2481         ump = VFSTOUFS(mp);
2482         wkhd = INDIR_HASH(ump, blkno);
2483         TAILQ_FOREACH(freework, wkhd, fw_next) {
2484                 if (freework->fw_blkno != blkno)
2485                         continue;
2486                 indirblk_remove(freework);
2487                 return (1);
2488         }
2489         return (0);
2490 }
2491
2492 /*
2493  * Insert an indirect block represented by freework into the indirblk
2494  * hash table so that it may prevent the block from being re-used prior
2495  * to the journal being written.
2496  */
2497 static void
2498 indirblk_insert(struct freework *freework)
2499 {
2500         struct jblocks *jblocks;
2501         struct jseg *jseg;
2502         struct ufsmount *ump;
2503
2504         ump = VFSTOUFS(freework->fw_list.wk_mp);
2505         jblocks = ump->softdep_jblocks;
2506         jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2507         if (jseg == NULL)
2508                 return;
2509
2510         LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2511         TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2512             fw_next);
2513         freework->fw_state &= ~DEPCOMPLETE;
2514 }
2515
2516 static void
2517 indirblk_remove(struct freework *freework)
2518 {
2519         struct ufsmount *ump;
2520
2521         ump = VFSTOUFS(freework->fw_list.wk_mp);
2522         LIST_REMOVE(freework, fw_segs);
2523         TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2524         freework->fw_state |= DEPCOMPLETE;
2525         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2526                 WORKITEM_FREE(freework, D_FREEWORK);
2527 }
2528
2529 /*
2530  * Executed during filesystem system initialization before
2531  * mounting any filesystems.
2532  */
2533 void
2534 softdep_initialize(void)
2535 {
2536
2537         TAILQ_INIT(&softdepmounts);
2538 #ifdef __LP64__
2539         max_softdeps = desiredvnodes * 4;
2540 #else
2541         max_softdeps = desiredvnodes * 2;
2542 #endif
2543
2544         /* initialise bioops hack */
2545         bioops.io_start = softdep_disk_io_initiation;
2546         bioops.io_complete = softdep_disk_write_complete;
2547         bioops.io_deallocate = softdep_deallocate_dependencies;
2548         bioops.io_countdeps = softdep_count_dependencies;
2549         softdep_ast_cleanup = softdep_ast_cleanup_proc;
2550
2551         /* Initialize the callout with an mtx. */
2552         callout_init_mtx(&softdep_callout, &lk, 0);
2553 }
2554
2555 /*
2556  * Executed after all filesystems have been unmounted during
2557  * filesystem module unload.
2558  */
2559 void
2560 softdep_uninitialize(void)
2561 {
2562
2563         /* clear bioops hack */
2564         bioops.io_start = NULL;
2565         bioops.io_complete = NULL;
2566         bioops.io_deallocate = NULL;
2567         bioops.io_countdeps = NULL;
2568         softdep_ast_cleanup = NULL;
2569
2570         callout_drain(&softdep_callout);
2571 }
2572
2573 /*
2574  * Called at mount time to notify the dependency code that a
2575  * filesystem wishes to use it.
2576  */
2577 int
2578 softdep_mount(struct vnode *devvp,
2579         struct mount *mp,
2580         struct fs *fs,
2581         struct ucred *cred)
2582 {
2583         struct csum_total cstotal;
2584         struct mount_softdeps *sdp;
2585         struct ufsmount *ump;
2586         struct cg *cgp;
2587         struct buf *bp;
2588         uint64_t cyl, i;
2589         int error;
2590
2591         ump = VFSTOUFS(mp);
2592
2593         sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2594             M_WAITOK | M_ZERO);
2595         rw_init(&sdp->sd_fslock, "SUrw");
2596         sdp->sd_ump = ump;
2597         LIST_INIT(&sdp->sd_workitem_pending);
2598         LIST_INIT(&sdp->sd_journal_pending);
2599         TAILQ_INIT(&sdp->sd_unlinked);
2600         LIST_INIT(&sdp->sd_dirtycg);
2601         sdp->sd_worklist_tail = NULL;
2602         sdp->sd_on_worklist = 0;
2603         sdp->sd_deps = 0;
2604         LIST_INIT(&sdp->sd_mkdirlisthd);
2605         sdp->sd_pdhash = hashinit(desiredvnodes / 5, M_PAGEDEP,
2606             &sdp->sd_pdhashsize);
2607         sdp->sd_pdnextclean = 0;
2608         sdp->sd_idhash = hashinit(desiredvnodes, M_INODEDEP,
2609             &sdp->sd_idhashsize);
2610         sdp->sd_idnextclean = 0;
2611         sdp->sd_newblkhash = hashinit(max_softdeps / 2,  M_NEWBLK,
2612             &sdp->sd_newblkhashsize);
2613         sdp->sd_bmhash = hashinit(1024, M_BMSAFEMAP, &sdp->sd_bmhashsize);
2614         i = 1 << (ffs(desiredvnodes / 10) - 1);
2615         sdp->sd_indirhash = malloc(i * sizeof(struct indir_hashhead),
2616             M_FREEWORK, M_WAITOK);
2617         sdp->sd_indirhashsize = i - 1;
2618         for (i = 0; i <= sdp->sd_indirhashsize; i++)
2619                 TAILQ_INIT(&sdp->sd_indirhash[i]);
2620         for (i = 0; i <= D_LAST; i++)
2621                 LIST_INIT(&sdp->sd_alldeps[i]);
2622         ACQUIRE_GBLLOCK(&lk);
2623         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2624         FREE_GBLLOCK(&lk);
2625
2626         ump->um_softdep = sdp;
2627         MNT_ILOCK(mp);
2628         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2629         if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2630                 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2631                     MNTK_SOFTDEP | MNTK_NOASYNC;
2632         }
2633         MNT_IUNLOCK(mp);
2634
2635         if ((fs->fs_flags & FS_SUJ) &&
2636             (error = journal_mount(mp, fs, cred)) != 0) {
2637                 printf("Failed to start journal: %d\n", error);
2638                 softdep_unmount(mp);
2639                 return (error);
2640         }
2641         /*
2642          * Start our flushing thread in the bufdaemon process.
2643          */
2644         ACQUIRE_LOCK(ump);
2645         ump->softdep_flags |= FLUSH_STARTING;
2646         FREE_LOCK(ump);
2647         kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2648             &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2649             mp->mnt_stat.f_mntonname);
2650         ACQUIRE_LOCK(ump);
2651         while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2652                 msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2653                     hz / 2);
2654         }
2655         FREE_LOCK(ump);
2656         /*
2657          * When doing soft updates, the counters in the
2658          * superblock may have gotten out of sync. Recomputation
2659          * can take a long time and can be deferred for background
2660          * fsck.  However, the old behavior of scanning the cylinder
2661          * groups and recalculating them at mount time is available
2662          * by setting vfs.ffs.compute_summary_at_mount to one.
2663          */
2664         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2665                 return (0);
2666         bzero(&cstotal, sizeof cstotal);
2667         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2668                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2669                     fs->fs_cgsize, cred, &bp)) != 0) {
2670                         brelse(bp);
2671                         softdep_unmount(mp);
2672                         return (error);
2673                 }
2674                 cgp = (struct cg *)bp->b_data;
2675                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2676                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2677                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2678                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2679                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
2680                 brelse(bp);
2681         }
2682 #ifdef INVARIANTS
2683         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2684                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2685 #endif
2686         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2687         return (0);
2688 }
2689
2690 void
2691 softdep_unmount(struct mount *mp)
2692 {
2693         struct ufsmount *ump;
2694         struct mount_softdeps *ums;
2695
2696         ump = VFSTOUFS(mp);
2697         KASSERT(ump->um_softdep != NULL,
2698             ("softdep_unmount called on non-softdep filesystem"));
2699         MNT_ILOCK(mp);
2700         mp->mnt_flag &= ~MNT_SOFTDEP;
2701         if ((mp->mnt_flag & MNT_SUJ) == 0) {
2702                 MNT_IUNLOCK(mp);
2703         } else {
2704                 mp->mnt_flag &= ~MNT_SUJ;
2705                 MNT_IUNLOCK(mp);
2706                 journal_unmount(ump);
2707         }
2708         /*
2709          * Shut down our flushing thread. Check for NULL is if
2710          * softdep_mount errors out before the thread has been created.
2711          */
2712         if (ump->softdep_flushtd != NULL) {
2713                 ACQUIRE_LOCK(ump);
2714                 ump->softdep_flags |= FLUSH_EXIT;
2715                 wakeup(&ump->softdep_flushtd);
2716                 while ((ump->softdep_flags & FLUSH_EXIT) != 0) {
2717                         msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM,
2718                             "sdwait", 0);
2719                 }
2720                 KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2721                     ("Thread shutdown failed"));
2722                 FREE_LOCK(ump);
2723         }
2724
2725         /*
2726          * We are no longer have softdep structure attached to ump.
2727          */
2728         ums = ump->um_softdep;
2729         ACQUIRE_GBLLOCK(&lk);
2730         TAILQ_REMOVE(&softdepmounts, ums, sd_next);
2731         FREE_GBLLOCK(&lk);
2732         ump->um_softdep = NULL;
2733
2734         KASSERT(ums->sd_on_journal == 0,
2735             ("ump %p ums %p on_journal %d", ump, ums, ums->sd_on_journal));
2736         KASSERT(ums->sd_on_worklist == 0,
2737             ("ump %p ums %p on_worklist %d", ump, ums, ums->sd_on_worklist));
2738         KASSERT(ums->sd_deps == 0,
2739             ("ump %p ums %p deps %d", ump, ums, ums->sd_deps));
2740
2741         /*
2742          * Free up our resources.
2743          */
2744         rw_destroy(&ums->sd_fslock);
2745         hashdestroy(ums->sd_pdhash, M_PAGEDEP, ums->sd_pdhashsize);
2746         hashdestroy(ums->sd_idhash, M_INODEDEP, ums->sd_idhashsize);
2747         hashdestroy(ums->sd_newblkhash, M_NEWBLK, ums->sd_newblkhashsize);
2748         hashdestroy(ums->sd_bmhash, M_BMSAFEMAP, ums->sd_bmhashsize);
2749         free(ums->sd_indirhash, M_FREEWORK);
2750 #ifdef INVARIANTS
2751         for (int i = 0; i <= D_LAST; i++) {
2752                 KASSERT(ums->sd_curdeps[i] == 0,
2753                     ("Unmount %s: Dep type %s != 0 (%jd)", ump->um_fs->fs_fsmnt,
2754                     TYPENAME(i), (intmax_t)ums->sd_curdeps[i]));
2755                 KASSERT(LIST_EMPTY(&ums->sd_alldeps[i]),
2756                     ("Unmount %s: Dep type %s not empty (%p)",
2757                     ump->um_fs->fs_fsmnt,
2758                     TYPENAME(i), LIST_FIRST(&ums->sd_alldeps[i])));
2759         }
2760 #endif
2761         free(ums, M_MOUNTDATA);
2762 }
2763
2764 static struct jblocks *
2765 jblocks_create(void)
2766 {
2767         struct jblocks *jblocks;
2768
2769         jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2770         TAILQ_INIT(&jblocks->jb_segs);
2771         jblocks->jb_avail = 10;
2772         jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2773             M_JBLOCKS, M_WAITOK | M_ZERO);
2774
2775         return (jblocks);
2776 }
2777
2778 static ufs2_daddr_t
2779 jblocks_alloc(struct jblocks *jblocks,
2780         int bytes,
2781         int *actual)
2782 {
2783         ufs2_daddr_t daddr;
2784         struct jextent *jext;
2785         int freecnt;
2786         int blocks;
2787
2788         blocks = bytes / DEV_BSIZE;
2789         jext = &jblocks->jb_extent[jblocks->jb_head];
2790         freecnt = jext->je_blocks - jblocks->jb_off;
2791         if (freecnt == 0) {
2792                 jblocks->jb_off = 0;
2793                 if (++jblocks->jb_head > jblocks->jb_used)
2794                         jblocks->jb_head = 0;
2795                 jext = &jblocks->jb_extent[jblocks->jb_head];
2796                 freecnt = jext->je_blocks;
2797         }
2798         if (freecnt > blocks)
2799                 freecnt = blocks;
2800         *actual = freecnt * DEV_BSIZE;
2801         daddr = jext->je_daddr + jblocks->jb_off;
2802         jblocks->jb_off += freecnt;
2803         jblocks->jb_free -= freecnt;
2804
2805         return (daddr);
2806 }
2807
2808 static void
2809 jblocks_free(struct jblocks *jblocks,
2810         struct mount *mp,
2811         int bytes)
2812 {
2813
2814         LOCK_OWNED(VFSTOUFS(mp));
2815         jblocks->jb_free += bytes / DEV_BSIZE;
2816         if (jblocks->jb_suspended)
2817                 worklist_speedup(mp);
2818         wakeup(jblocks);
2819 }
2820
2821 static void
2822 jblocks_destroy(struct jblocks *jblocks)
2823 {
2824
2825         if (jblocks->jb_extent)
2826                 free(jblocks->jb_extent, M_JBLOCKS);
2827         free(jblocks, M_JBLOCKS);
2828 }
2829
2830 static void
2831 jblocks_add(struct jblocks *jblocks,
2832         ufs2_daddr_t daddr,
2833         int blocks)
2834 {
2835         struct jextent *jext;
2836
2837         jblocks->jb_blocks += blocks;
2838         jblocks->jb_free += blocks;
2839         jext = &jblocks->jb_extent[jblocks->jb_used];
2840         /* Adding the first block. */
2841         if (jext->je_daddr == 0) {
2842                 jext->je_daddr = daddr;
2843                 jext->je_blocks = blocks;
2844                 return;
2845         }
2846         /* Extending the last extent. */
2847         if (jext->je_daddr + jext->je_blocks == daddr) {
2848                 jext->je_blocks += blocks;
2849                 return;
2850         }
2851         /* Adding a new extent. */
2852         if (++jblocks->jb_used == jblocks->jb_avail) {
2853                 jblocks->jb_avail *= 2;
2854                 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2855                     M_JBLOCKS, M_WAITOK | M_ZERO);
2856                 memcpy(jext, jblocks->jb_extent,
2857                     sizeof(struct jextent) * jblocks->jb_used);
2858                 free(jblocks->jb_extent, M_JBLOCKS);
2859                 jblocks->jb_extent = jext;
2860         }
2861         jext = &jblocks->jb_extent[jblocks->jb_used];
2862         jext->je_daddr = daddr;
2863         jext->je_blocks = blocks;
2864         return;
2865 }
2866
2867 int
2868 softdep_journal_lookup(struct mount *mp, struct vnode **vpp)
2869 {
2870         struct componentname cnp;
2871         struct vnode *dvp;
2872         ino_t sujournal;
2873         int error;
2874
2875         error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
2876         if (error)
2877                 return (error);
2878         bzero(&cnp, sizeof(cnp));
2879         cnp.cn_nameiop = LOOKUP;
2880         cnp.cn_flags = ISLASTCN;
2881         cnp.cn_thread = curthread;
2882         cnp.cn_cred = curthread->td_ucred;
2883         cnp.cn_pnbuf = SUJ_FILE;
2884         cnp.cn_nameptr = SUJ_FILE;
2885         cnp.cn_namelen = strlen(SUJ_FILE);
2886         error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2887         vput(dvp);
2888         if (error != 0)
2889                 return (error);
2890         error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2891         return (error);
2892 }
2893
2894 /*
2895  * Open and verify the journal file.
2896  */
2897 static int
2898 journal_mount(struct mount *mp,
2899         struct fs *fs,
2900         struct ucred *cred)
2901 {
2902         struct jblocks *jblocks;
2903         struct ufsmount *ump;
2904         struct vnode *vp;
2905         struct inode *ip;
2906         ufs2_daddr_t blkno;
2907         int bcount;
2908         int error;
2909         int i;
2910
2911         ump = VFSTOUFS(mp);
2912         ump->softdep_journal_tail = NULL;
2913         ump->softdep_on_journal = 0;
2914         ump->softdep_accdeps = 0;
2915         ump->softdep_req = 0;
2916         ump->softdep_jblocks = NULL;
2917         error = softdep_journal_lookup(mp, &vp);
2918         if (error != 0) {
2919                 printf("Failed to find journal.  Use tunefs to create one\n");
2920                 return (error);
2921         }
2922         ip = VTOI(vp);
2923         if (ip->i_size < SUJ_MIN) {
2924                 error = ENOSPC;
2925                 goto out;
2926         }
2927         bcount = lblkno(fs, ip->i_size);        /* Only use whole blocks. */
2928         jblocks = jblocks_create();
2929         for (i = 0; i < bcount; i++) {
2930                 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2931                 if (error)
2932                         break;
2933                 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2934         }
2935         if (error) {
2936                 jblocks_destroy(jblocks);
2937                 goto out;
2938         }
2939         jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
2940         jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2941         ump->softdep_jblocks = jblocks;
2942
2943         MNT_ILOCK(mp);
2944         mp->mnt_flag |= MNT_SUJ;
2945         MNT_IUNLOCK(mp);
2946
2947         /*
2948          * Only validate the journal contents if the
2949          * filesystem is clean, otherwise we write the logs
2950          * but they'll never be used.  If the filesystem was
2951          * still dirty when we mounted it the journal is
2952          * invalid and a new journal can only be valid if it
2953          * starts from a clean mount.
2954          */
2955         if (fs->fs_clean) {
2956                 DIP_SET(ip, i_modrev, fs->fs_mtime);
2957                 ip->i_flags |= IN_MODIFIED;
2958                 ffs_update(vp, 1);
2959         }
2960 out:
2961         vput(vp);
2962         return (error);
2963 }
2964
2965 static void
2966 journal_unmount(struct ufsmount *ump)
2967 {
2968
2969         if (ump->softdep_jblocks)
2970                 jblocks_destroy(ump->softdep_jblocks);
2971         ump->softdep_jblocks = NULL;
2972 }
2973
2974 /*
2975  * Called when a journal record is ready to be written.  Space is allocated
2976  * and the journal entry is created when the journal is flushed to stable
2977  * store.
2978  */
2979 static void
2980 add_to_journal(struct worklist *wk)
2981 {
2982         struct ufsmount *ump;
2983
2984         ump = VFSTOUFS(wk->wk_mp);
2985         LOCK_OWNED(ump);
2986         if (wk->wk_state & ONWORKLIST)
2987                 panic("add_to_journal: %s(0x%X) already on list",
2988                     TYPENAME(wk->wk_type), wk->wk_state);
2989         wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2990         if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2991                 ump->softdep_jblocks->jb_age = ticks;
2992                 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2993         } else
2994                 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2995         ump->softdep_journal_tail = wk;
2996         ump->softdep_on_journal += 1;
2997 }
2998
2999 /*
3000  * Remove an arbitrary item for the journal worklist maintain the tail
3001  * pointer.  This happens when a new operation obviates the need to
3002  * journal an old operation.
3003  */
3004 static void
3005 remove_from_journal(struct worklist *wk)
3006 {
3007         struct ufsmount *ump;
3008
3009         ump = VFSTOUFS(wk->wk_mp);
3010         LOCK_OWNED(ump);
3011 #ifdef INVARIANTS
3012         {
3013                 struct worklist *wkn;
3014
3015                 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
3016                         if (wkn == wk)
3017                                 break;
3018                 if (wkn == NULL)
3019                         panic("remove_from_journal: %p is not in journal", wk);
3020         }
3021 #endif
3022         /*
3023          * We emulate a TAILQ to save space in most structures which do not
3024          * require TAILQ semantics.  Here we must update the tail position
3025          * when removing the tail which is not the final entry. This works
3026          * only if the worklist linkage are at the beginning of the structure.
3027          */
3028         if (ump->softdep_journal_tail == wk)
3029                 ump->softdep_journal_tail =
3030                     (struct worklist *)wk->wk_list.le_prev;
3031         WORKLIST_REMOVE(wk);
3032         ump->softdep_on_journal -= 1;
3033 }
3034
3035 /*
3036  * Check for journal space as well as dependency limits so the prelink
3037  * code can throttle both journaled and non-journaled filesystems.
3038  * Threshold is 0 for low and 1 for min.
3039  */
3040 static int
3041 journal_space(struct ufsmount *ump, int thresh)
3042 {
3043         struct jblocks *jblocks;
3044         int limit, avail;
3045
3046         jblocks = ump->softdep_jblocks;
3047         if (jblocks == NULL)
3048                 return (1);
3049         /*
3050          * We use a tighter restriction here to prevent request_cleanup()
3051          * running in threads from running into locks we currently hold.
3052          * We have to be over the limit and our filesystem has to be
3053          * responsible for more than our share of that usage.
3054          */
3055         limit = (max_softdeps / 10) * 9;
3056         if (dep_current[D_INODEDEP] > limit &&
3057             ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
3058                 return (0);
3059         if (thresh)
3060                 thresh = jblocks->jb_min;
3061         else
3062                 thresh = jblocks->jb_low;
3063         avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
3064         avail = jblocks->jb_free - avail;
3065
3066         return (avail > thresh);
3067 }
3068
3069 static void
3070 journal_suspend(struct ufsmount *ump)
3071 {
3072         struct jblocks *jblocks;
3073         struct mount *mp;
3074         bool set;
3075
3076         mp = UFSTOVFS(ump);
3077         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
3078                 return;
3079
3080         jblocks = ump->softdep_jblocks;
3081         vfs_op_enter(mp);
3082         set = false;
3083         MNT_ILOCK(mp);
3084         if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
3085                 stat_journal_min++;
3086                 mp->mnt_kern_flag |= MNTK_SUSPEND;
3087                 mp->mnt_susp_owner = ump->softdep_flushtd;
3088                 set = true;
3089         }
3090         jblocks->jb_suspended = 1;
3091         MNT_IUNLOCK(mp);
3092         if (!set)
3093                 vfs_op_exit(mp);
3094 }
3095
3096 static int
3097 journal_unsuspend(struct ufsmount *ump)
3098 {
3099         struct jblocks *jblocks;
3100         struct mount *mp;
3101
3102         mp = UFSTOVFS(ump);
3103         jblocks = ump->softdep_jblocks;
3104
3105         if (jblocks != NULL && jblocks->jb_suspended &&
3106             journal_space(ump, jblocks->jb_min)) {
3107                 jblocks->jb_suspended = 0;
3108                 FREE_LOCK(ump);
3109                 mp->mnt_susp_owner = curthread;
3110                 vfs_write_resume(mp, 0);
3111                 ACQUIRE_LOCK(ump);
3112                 return (1);
3113         }
3114         return (0);
3115 }
3116
3117 static void
3118 journal_check_space(struct ufsmount *ump)
3119 {
3120         struct mount *mp;
3121
3122         LOCK_OWNED(ump);
3123
3124         if (journal_space(ump, 0) == 0) {
3125                 softdep_speedup(ump);
3126                 mp = UFSTOVFS(ump);
3127                 FREE_LOCK(ump);
3128                 VFS_SYNC(mp, MNT_NOWAIT);
3129                 ffs_sbupdate(ump, MNT_WAIT, 0);
3130                 ACQUIRE_LOCK(ump);
3131                 if (journal_space(ump, 1) == 0)
3132                         journal_suspend(ump);
3133         }
3134 }
3135
3136 /*
3137  * Called before any allocation function to be certain that there is
3138  * sufficient space in the journal prior to creating any new records.
3139  * Since in the case of block allocation we may have multiple locked
3140  * buffers at the time of the actual allocation we can not block
3141  * when the journal records are created.  Doing so would create a deadlock
3142  * if any of these buffers needed to be flushed to reclaim space.  Instead
3143  * we require a sufficiently large amount of available space such that
3144  * each thread in the system could have passed this allocation check and
3145  * still have sufficient free space.  With 20% of a minimum journal size
3146  * of 1MB we have 6553 records available.
3147  */
3148 int
3149 softdep_prealloc(struct vnode *vp, int waitok)
3150 {
3151         struct ufsmount *ump;
3152
3153         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
3154             ("softdep_prealloc called on non-softdep filesystem"));
3155         /*
3156          * Nothing to do if we are not running journaled soft updates.
3157          * If we currently hold the snapshot lock, we must avoid
3158          * handling other resources that could cause deadlock.  Do not
3159          * touch quotas vnode since it is typically recursed with
3160          * other vnode locks held.
3161          */
3162         if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3163             (vp->v_vflag & VV_SYSTEM) != 0)
3164                 return (0);
3165         ump = VFSTOUFS(vp->v_mount);
3166         ACQUIRE_LOCK(ump);
3167         if (journal_space(ump, 0)) {
3168                 FREE_LOCK(ump);
3169                 return (0);
3170         }
3171         stat_journal_low++;
3172         FREE_LOCK(ump);
3173         if (waitok == MNT_NOWAIT)
3174                 return (ENOSPC);
3175         /*
3176          * Attempt to sync this vnode once to flush any journal
3177          * work attached to it.
3178          */
3179         if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3180                 ffs_syncvnode(vp, waitok, 0);
3181         ACQUIRE_LOCK(ump);
3182         process_removes(vp);
3183         process_truncates(vp);
3184         journal_check_space(ump);
3185         FREE_LOCK(ump);
3186
3187         return (0);
3188 }
3189
3190 /*
3191  * Try hard to sync all data and metadata for the vnode, and workitems
3192  * flushing which might conflict with the vnode lock.  This is a
3193  * helper for softdep_prerename().
3194  */
3195 static int
3196 softdep_prerename_vnode(struct ufsmount *ump, struct vnode *vp)
3197 {
3198         int error;
3199
3200         ASSERT_VOP_ELOCKED(vp, "prehandle");
3201         if (vp->v_data == NULL)
3202                 return (0);
3203         error = VOP_FSYNC(vp, MNT_WAIT, curthread);
3204         if (error != 0)
3205                 return (error);
3206         ACQUIRE_LOCK(ump);
3207         process_removes(vp);
3208         process_truncates(vp);
3209         FREE_LOCK(ump);
3210         return (0);
3211 }
3212
3213 /*
3214  * Must be called from VOP_RENAME() after all vnodes are locked.
3215  * Ensures that there is enough journal space for rename.  It is
3216  * sufficiently different from softdep_prelink() by having to handle
3217  * four vnodes.
3218  */
3219 int
3220 softdep_prerename(struct vnode *fdvp,
3221         struct vnode *fvp,
3222         struct vnode *tdvp,
3223         struct vnode *tvp)
3224 {
3225         struct ufsmount *ump;
3226         int error;
3227
3228         ump = VFSTOUFS(fdvp->v_mount);
3229
3230         if (journal_space(ump, 0))
3231                 return (0);
3232
3233         VOP_UNLOCK(tdvp);
3234         VOP_UNLOCK(fvp);
3235         if (tvp != NULL && tvp != tdvp)
3236                 VOP_UNLOCK(tvp);
3237
3238         error = softdep_prerename_vnode(ump, fdvp);
3239         VOP_UNLOCK(fdvp);
3240         if (error != 0)
3241                 return (error);
3242
3243         VOP_LOCK(fvp, LK_EXCLUSIVE | LK_RETRY);
3244         error = softdep_prerename_vnode(ump, fvp);
3245         VOP_UNLOCK(fvp);
3246         if (error != 0)
3247                 return (error);
3248
3249         if (tdvp != fdvp) {
3250                 VOP_LOCK(tdvp, LK_EXCLUSIVE | LK_RETRY);
3251                 error = softdep_prerename_vnode(ump, tdvp);
3252                 VOP_UNLOCK(tdvp);
3253                 if (error != 0)
3254                         return (error);
3255         }
3256
3257         if (tvp != fvp && tvp != NULL) {
3258                 VOP_LOCK(tvp, LK_EXCLUSIVE | LK_RETRY);
3259                 error = softdep_prerename_vnode(ump, tvp);
3260                 VOP_UNLOCK(tvp);
3261                 if (error != 0)
3262                         return (error);
3263         }
3264
3265         ACQUIRE_LOCK(ump);
3266         softdep_speedup(ump);
3267         process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3268         journal_check_space(ump);
3269         FREE_LOCK(ump);
3270         return (ERELOOKUP);
3271 }
3272
3273 /*
3274  * Before adjusting a link count on a vnode verify that we have sufficient
3275  * journal space.  If not, process operations that depend on the currently
3276  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3277  * and softdep flush threads can not acquire these locks to reclaim space.
3278  *
 * Returns 0 if all owned locks are still valid and were not dropped
 * in the process; otherwise it returns either an error from sync,
 * or ERELOOKUP if any of the locks were re-acquired.  In the latter
 * case, the state of the vnodes cannot be relied upon and our VFS
 * syscall must be restarted at top level from the lookup.
3284  */
3285 int
3286 softdep_prelink(struct vnode *dvp,
3287         struct vnode *vp,
3288         struct componentname *cnp)
3289 {
3290         struct ufsmount *ump;
3291         struct nameidata *ndp;
3292
3293         ASSERT_VOP_ELOCKED(dvp, "prelink dvp");
3294         if (vp != NULL)
3295                 ASSERT_VOP_ELOCKED(vp, "prelink vp");
3296         ump = VFSTOUFS(dvp->v_mount);
3297
3298         /*
3299          * Nothing to do if we have sufficient journal space.  We skip
3300          * flushing when vp is a snapshot to avoid deadlock where
3301          * another thread is trying to update the inodeblock for dvp
3302          * and is waiting on snaplk that vp holds.
3303          */
3304         if (journal_space(ump, 0) || (vp != NULL && IS_SNAPSHOT(VTOI(vp))))
3305                 return (0);
3306
3307         /*
3308          * Check if the journal space consumption can in theory be
3309          * accounted on dvp and vp.  If the vnodes metadata was not
3310          * changed comparing with the previous round-trip into
3311          * softdep_prelink(), as indicated by the seqc generation
3312          * recorded in the nameidata, then there is no point in
3313          * starting the sync.
3314          */
3315         ndp = __containerof(cnp, struct nameidata, ni_cnd);
3316         if (!seqc_in_modify(ndp->ni_dvp_seqc) &&
3317             vn_seqc_consistent(dvp, ndp->ni_dvp_seqc) &&
3318             (vp == NULL || (!seqc_in_modify(ndp->ni_vp_seqc) &&
3319             vn_seqc_consistent(vp, ndp->ni_vp_seqc))))
3320                 return (0);
3321
3322         stat_journal_low++;
3323         if (vp != NULL) {
3324                 VOP_UNLOCK(dvp);
3325                 ffs_syncvnode(vp, MNT_NOWAIT, 0);
3326                 vn_lock_pair(dvp, false, LK_EXCLUSIVE, vp, true, LK_EXCLUSIVE);
3327                 if (dvp->v_data == NULL)
3328                         goto out;
3329         }
3330         if (vp != NULL)
3331                 VOP_UNLOCK(vp);
3332         ffs_syncvnode(dvp, MNT_WAIT, 0);
3333         /* Process vp before dvp as it may create .. removes. */
3334         if (vp != NULL) {
3335                 VOP_UNLOCK(dvp);
3336                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3337                 if (vp->v_data == NULL) {
3338                         vn_lock_pair(dvp, false, LK_EXCLUSIVE, vp, true,
3339                             LK_EXCLUSIVE);
3340                         goto out;
3341                 }
3342                 ACQUIRE_LOCK(ump);
3343                 process_removes(vp);
3344                 process_truncates(vp);
3345                 FREE_LOCK(ump);
3346                 VOP_UNLOCK(vp);
3347                 vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
3348                 if (dvp->v_data == NULL) {
3349                         vn_lock_pair(dvp, true, LK_EXCLUSIVE, vp, false,
3350                             LK_EXCLUSIVE);
3351                         goto out;
3352                 }
3353         }
3354
3355         ACQUIRE_LOCK(ump);
3356         process_removes(dvp);
3357         process_truncates(dvp);
3358         VOP_UNLOCK(dvp);
3359         softdep_speedup(ump);
3360
3361         process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3362         journal_check_space(ump);
3363         FREE_LOCK(ump);
3364
3365         vn_lock_pair(dvp, false, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
3366 out:
3367         ndp->ni_dvp_seqc = vn_seqc_read_any(dvp);
3368         if (vp != NULL)
3369                 ndp->ni_vp_seqc = vn_seqc_read_any(vp);
3370         return (ERELOOKUP);
3371 }
3372
     /*
      * Fill in the segment header record at data for journal segment jseg.
      * The block count is js_size divided by the block size of the device
      * vnode's buffer object; the crc field is stored as zero.
      */
3373 static void
3374 jseg_write(struct ufsmount *ump,
3375         struct jseg *jseg,
3376         uint8_t *data)
3377 {
3378         struct jsegrec *hdr;
3379
3380         hdr = (struct jsegrec *)data;
3381         hdr->jsr_seq = jseg->js_seq;
3382         hdr->jsr_oldest = jseg->js_oldseq;
3383         hdr->jsr_cnt = jseg->js_cnt;
3384         hdr->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3385         hdr->jsr_time = ump->um_fs->fs_mtime;
3386         hdr->jsr_crc = 0;
3387 }
3388
     /*
      * Copy the fields common to add and remove reference records from
      * inoref into rec and point the inoref's jsegdep at jseg.  The
      * operation code (jr_op) is left for the caller to set.
      */
3389 static inline void
3390 inoref_write(struct inoref *inoref,
3391         struct jseg *jseg,
3392         struct jrefrec *rec)
3393 {
3394
3395         rec->jr_ino = inoref->if_ino;
3396         rec->jr_parent = inoref->if_parent;
3397         rec->jr_nlink = inoref->if_nlink;
3398         rec->jr_mode = inoref->if_mode;
3399         rec->jr_diroff = inoref->if_diroff;
3400         inoref->if_jsegdep->jd_seg = jseg;
3401 }
3402
     /*
      * Emit a JOP_ADDREF record at data for jaddref's inode reference.
      */
3403 static void
3404 jaddref_write(struct jaddref *jaddref,
3405         struct jseg *jseg,
3406         uint8_t *data)
3407 {
3408         struct jrefrec *ref;
3409
3410         ref = (struct jrefrec *)data;
3411         inoref_write(&jaddref->ja_ref, jseg, ref);
3412         ref->jr_op = JOP_ADDREF;
3413 }
3414
     /*
      * Emit a JOP_REMREF record at data for jremref's inode reference.
      */
3415 static void
3416 jremref_write(struct jremref *jremref,
3417         struct jseg *jseg,
3418         uint8_t *data)
3419 {
3420         struct jrefrec *ref;
3421
3422         ref = (struct jrefrec *)data;
3423         inoref_write(&jremref->jr_ref, jseg, ref);
3424         ref->jr_op = JOP_REMREF;
3425 }
3426
     /*
      * Emit a JOP_MVREF record at data from jmvref.  The jseg argument is
      * not used by this writer.
      */
3427 static void
3428 jmvref_write(struct jmvref *jmvref,
3429         struct jseg *jseg,
3430         uint8_t *data)
3431 {
3432         struct jmvrec *mv;
3433
3434         mv = (struct jmvrec *)data;
3435         mv->jm_op = JOP_MVREF;
3436         mv->jm_parent = jmvref->jm_parent;
3437         mv->jm_ino = jmvref->jm_ino;
3438         mv->jm_newoff = jmvref->jm_newoff;
3439         mv->jm_oldoff = jmvref->jm_oldoff;
3440 }
3441
     /*
      * Emit a JOP_NEWBLK record at data from jnewblk and point the
      * jnewblk's jsegdep at jseg.
      */
3442 static void
3443 jnewblk_write(struct jnewblk *jnewblk,
3444         struct jseg *jseg,
3445         uint8_t *data)
3446 {
3447         struct jblkrec *blk;
3448
3449         blk = (struct jblkrec *)data;
3450         blk->jb_op = JOP_NEWBLK;
3451         blk->jb_ino = jnewblk->jn_ino;
3452         blk->jb_lbn = jnewblk->jn_lbn;
3453         blk->jb_blkno = jnewblk->jn_blkno;
3454         blk->jb_frags = jnewblk->jn_frags;
3455         blk->jb_oldfrags = jnewblk->jn_oldfrags;
3456         jnewblk->jn_jsegdep->jd_seg = jseg;
3457 }
3458
     /*
      * Emit a JOP_FREEBLK record at data from jfreeblk and point its
      * jsegdep at jseg.  jb_oldfrags is always written as zero.
      */
3459 static void
3460 jfreeblk_write(struct jfreeblk *jfreeblk,
3461         struct jseg *jseg,
3462         uint8_t *data)
3463 {
3464         struct jblkrec *blk;
3465
3466         blk = (struct jblkrec *)data;
3467         blk->jb_op = JOP_FREEBLK;
3468         blk->jb_ino = jfreeblk->jf_ino;
3469         blk->jb_lbn = jfreeblk->jf_lbn;
3470         blk->jb_blkno = jfreeblk->jf_blkno;
3471         blk->jb_frags = jfreeblk->jf_frags;
3472         blk->jb_oldfrags = 0;
3473         jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3474 }
3475
     /*
      * Emit a record at data for jfreefrag and point its jsegdep at jseg.
      * The record uses the JOP_FREEBLK operation code, the same one that
      * jfreeblk_write() emits, with jb_oldfrags written as zero.
      */
3476 static void
3477 jfreefrag_write(struct jfreefrag *jfreefrag,
3478         struct jseg *jseg,
3479         uint8_t *data)
3480 {
3481         struct jblkrec *blk;
3482
3483         blk = (struct jblkrec *)data;
3484         blk->jb_op = JOP_FREEBLK;
3485         blk->jb_ino = jfreefrag->fr_ino;
3486         blk->jb_lbn = jfreefrag->fr_lbn;
3487         blk->jb_blkno = jfreefrag->fr_blkno;
3488         blk->jb_frags = jfreefrag->fr_frags;
3489         blk->jb_oldfrags = 0;
3490         jfreefrag->fr_jsegdep->jd_seg = jseg;
3491 }
3492
     /*
      * Emit a JOP_TRUNC record at data from jtrunc and point its jsegdep
      * at jseg.
      */
3493 static void
3494 jtrunc_write(struct jtrunc *jtrunc,
3495         struct jseg *jseg,
3496         uint8_t *data)
3497 {
3498         struct jtrncrec *trn;
3499
3500         trn = (struct jtrncrec *)data;
3501         trn->jt_op = JOP_TRUNC;
3502         trn->jt_ino = jtrunc->jt_ino;
3503         trn->jt_extsize = jtrunc->jt_extsize;
3504         trn->jt_size = jtrunc->jt_size;
3505         jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3506 }
3507
     /*
      * Emit a JOP_SYNC record at data from jfsync.  The record shares the
      * jtrncrec layout with truncation records; jseg is not used here.
      */
3508 static void
3509 jfsync_write(struct jfsync *jfsync,
3510         struct jseg *jseg,
3511         uint8_t *data)
3512 {
3513         struct jtrncrec *trn;
3514
3515         trn = (struct jtrncrec *)data;
3516         trn->jt_op = JOP_SYNC;
3517         trn->jt_ino = jfsync->jfs_ino;
3518         trn->jt_extsize = jfsync->jfs_extsize;
3519         trn->jt_size = jfsync->jfs_size;
3520 }
3521
     /*
      * Push out every journal record still pending on mp.  Does nothing
      * unless MOUNTEDSUJ(mp) is true.  While softdep_on_journal is non-zero
      * a segment write is requested through jb_needseg and
      * softdep_process_journal() is run with MNT_WAIT, all under the
      * softdep lock taken here.
      */
3522 static void
3523 softdep_flushjournal(struct mount *mp)
3524 {
3525         struct ufsmount *ump;
3526         struct jblocks *jb;
3527
3528         if (!MOUNTEDSUJ(mp))
3529                 return;
3530         ump = VFSTOUFS(mp);
3531         jb = ump->softdep_jblocks;
3532         ACQUIRE_LOCK(ump);
3533         while (ump->softdep_on_journal != 0) {
3534                 jb->jb_needseg = 1;
3535                 softdep_process_journal(mp, NULL, MNT_WAIT);
3536         }
3537         FREE_LOCK(ump);
3538 }
3539
     /*
      * Forward declarations: softdep_synchronize() installs
      * softdep_synchronize_completed() as the bio completion handler, and
      * the handler is defined first.
      */
3540 static void softdep_synchronize_completed(struct bio *);
3541 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3542
     /*
      * Completion handler for the flush request sent by
      * softdep_synchronize().  Flags the journal segments that were waiting
      * on the flush as COMPLETE, restarts their completion processing, and
      * destroys the bio.
      */
3543 static void
3544 softdep_synchronize_completed(struct bio *bp)
3545 {
3546         struct ufsmount *ump;
3547         struct jseg *first;
3548         struct jseg *seg;
3549
3550         /*
3551          * bio_caller1 carries the last segment written before the
3552          * synchronize cache was issued; without one there is no work.
3553          */
3554         seg = bp->bio_caller1;
3555         if (seg == NULL) {
3556                 g_destroy_bio(bp);
3557                 return;
3558         }
3559         ump = VFSTOUFS(seg->js_list.wk_mp);
3560         first = NULL;
3561         ACQUIRE_LOCK(ump);
3562         /*
3563          * Walk back through the preceding segments, flagging each one
3564          * COMPLETE until one that already is COMPLETE (or none) is met.
3565          */
3566         while (seg != NULL && (seg->js_state & COMPLETE) == 0) {
3567                 first = seg;
3568                 seg->js_state |= COMPLETE;
3569                 seg = TAILQ_PREV(seg, jseglst, js_next);
3570         }
3571         /*
3572          * Resume deferred journal entry processing at the oldest
3573          * segment that was just flagged.
3574          */
3575         if (first != NULL)
3576                 complete_jsegs(first);
3577
3578         FREE_LOCK(ump);
3579         g_destroy_bio(bp);
3580 }
3581
3582 /*
3583  * Issue a BIO_FLUSH/SYNCHRONIZE CACHE to the device as a write ordering
3584  * barrier.  The journal must be written prior to any blocks that depend
3585  * on it, and the journal can not be released until those blocks have been
3586  * written.  A single flush request handles both barriers.
      *
      * bp is the bio to send.  caller1 is stored in bio_caller1 for
      * softdep_synchronize_completed(), which is installed as bio_done.
3587  */
3588 static void
3589 softdep_synchronize(struct bio *bp,
3590         struct ufsmount *ump,
3591         void *caller1)
3592 {
3593
3594         bp->bio_cmd = BIO_FLUSH;
3595         bp->bio_flags |= BIO_ORDERED;
3596         bp->bio_offset = ump->um_cp->provider->mediasize;
3597         bp->bio_length = 0;
3598         bp->bio_data = NULL;
3599         bp->bio_caller1 = caller1;
3600         bp->bio_done = softdep_synchronize_completed;
3601         g_io_request(bp, ump->um_cp);
3602 }
3603
3604 /*
3605  * Flush some journal records to disk.
      *
      * Pending journal work items are packed into newly allocated jsegs, each
      * backed by its own buffer, and written to the journal area.  A segment
      * header record is placed at the start of every device block.
      *
      * mp is the mount to operate on; nothing is done if it has no softdep
      * state or no journal blocks.  If needwk is not NULL the normal loop
      * exit test is suppressed until that work item has been placed in a
      * segment.  With flags == MNT_WAIT the routine sleeps while the journal
      * has no free space and uses bwrite() once needwk is no longer pending;
      * otherwise buffers are written with bawrite().  With flags == 0 a
      * suspended journal is additionally unsuspended or synced on the way
      * out.
      *
      * The softdep lock must be held by the caller (see LOCK_OWNED below).
      * It is released and re-acquired around the allocations and the buffer
      * writes.
3606  */
3607 static void
3608 softdep_process_journal(struct mount *mp,
3609         struct worklist *needwk,
3610         int flags)
3611 {
3612         struct jblocks *jblocks;
3613         struct ufsmount *ump;
3614         struct worklist *wk;
3615         struct jseg *jseg;
3616         struct buf *bp;
3617         struct bio *bio;
3618         uint8_t *data;
3619         struct fs *fs;
3620         int shouldflush;
3621         int segwritten;
3622         int jrecmin;    /* Minimum records per block. */
3623         int jrecmax;    /* Maximum records per block. */
3624         int size;
3625         int cnt;
3626         int off;
3627         int devbsize;
3628
3629         ump = VFSTOUFS(mp);
3630         if (ump->um_softdep == NULL || ump->um_softdep->sd_jblocks == NULL)
3631                 return;
             /* Sample the flush setting once so the whole call is consistent. */
3632         shouldflush = softdep_flushcache;
3633         bio = NULL;
3634         jseg = NULL;
3635         LOCK_OWNED(ump);
3636         fs = ump->um_fs;
3637         jblocks = ump->softdep_jblocks;
3638         devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3639         /*
3640          * We write anywhere between a disk block and fs block.  The upper
3641          * bound is picked to prevent buffer cache fragmentation and limit
3642          * processing time per I/O.
3643          */
3644         jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3645         jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3646         segwritten = 0;
3647         for (;;) {
3648                 cnt = ump->softdep_on_journal;
3649                 /*
3650                  * Criteria for writing a segment:
3651                  * 1) We have a full block.
3652                  * 2) We're called from jwait() and haven't found the
3653                  *    journal item yet.
3654                  * 3) Always write if needseg is set.
3655                  * 4) If we are called from process_worklist and have
3656                  *    not yet written anything we write a partial block
3657                  *    to enforce a 1 second maximum latency on journal
3658                  *    entries.
3659                  */
3660                 if (cnt < (jrecmax - 1) && needwk == NULL &&
3661                     jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3662                         break;
                     /*
                      * NOTE(review): cnt is reloaded from softdep_on_journal
                      * after the lock is re-acquired below and is not read
                      * in between, so this increment appears to have no
                      * effect.
                      */
3663                 cnt++;
3664                 /*
3665                  * Verify some free journal space.  softdep_prealloc() should
3666                  * guarantee that we don't run out so this is indicative of
3667                  * a problem with the flow control.  Try to recover
3668                  * gracefully in any event.
3669                  */
3670                 while (jblocks->jb_free == 0) {
3671                         if (flags != MNT_WAIT)
3672                                 break;
3673                         printf("softdep: Out of journal space!\n");
3674                         softdep_speedup(ump);
3675                         msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3676                 }
                     /*
                      * Allocate the jseg, the optional flush bio and the
                      * buffer with the softdep lock released.
                      */
3677                 FREE_LOCK(ump);
3678                 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3679                 workitem_alloc(&jseg->js_list, D_JSEG, mp);
3680                 LIST_INIT(&jseg->js_entries);
3681                 LIST_INIT(&jseg->js_indirs);
3682                 jseg->js_state = ATTACHED;
                     /*
                      * Without cache flushing there is nothing to wait for,
                      * so the segment starts out COMPLETE.  Otherwise a
                      * single bio is allocated for the flush issued after
                      * the loop.
                      */
3683                 if (shouldflush == 0)
3684                         jseg->js_state |= COMPLETE;
3685                 else if (bio == NULL)
3686                         bio = g_alloc_bio();
3687                 jseg->js_jblocks = jblocks;
3688                 bp = geteblk(fs->fs_bsize, 0);
3689                 ACQUIRE_LOCK(ump);
3690                 /*
3691                  * If there was a race while we were allocating the block
3692                  * and jseg the entry we care about was likely written.
3693                  * We bail out in both the WAIT and NOWAIT case and assume
3694                  * the caller will loop if the entry it cares about is
3695                  * not written.
3696                  */
3697                 cnt = ump->softdep_on_journal;
3698                 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3699                         bp->b_flags |= B_INVAL | B_NOCACHE;
3700                         WORKITEM_FREE(jseg, D_JSEG);
3701                         FREE_LOCK(ump);
3702                         brelse(bp);
3703                         ACQUIRE_LOCK(ump);
3704                         break;
3705                 }
3706                 /*
3707                  * Calculate the disk block size required for the available
3708                  * records rounded to the min size.
3709                  */
3710                 if (cnt == 0)
3711                         size = devbsize;
3712                 else if (cnt < jrecmax)
3713                         size = howmany(cnt, jrecmin) * devbsize;
3714                 else
3715                         size = fs->fs_bsize;
3716                 /*
3717                  * Allocate a disk block for this journal data and account
3718                  * for truncation of the requested size if enough contiguous
3719                  * space was not available.
3720                  */
3721                 bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3722                 bp->b_lblkno = bp->b_blkno;
3723                 bp->b_offset = bp->b_blkno * DEV_BSIZE;
3724                 bp->b_bcount = size;
3725                 bp->b_flags &= ~B_INVAL;
3726                 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3727                 /*
3728                  * Initialize our jseg with cnt records.  Assign the next
3729                  * sequence number to it and link it in-order.
3730                  */
3731                 cnt = MIN(cnt, (size / devbsize) * jrecmin);
3732                 jseg->js_buf = bp;
3733                 jseg->js_cnt = cnt;
3734                 jseg->js_refs = cnt + 1;        /* Self ref. */
3735                 jseg->js_size = size;
3736                 jseg->js_seq = jblocks->jb_nextseq++;
3737                 if (jblocks->jb_oldestseg == NULL)
3738                         jblocks->jb_oldestseg = jseg;
3739                 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3740                 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3741                 if (jblocks->jb_writeseg == NULL)
3742                         jblocks->jb_writeseg = jseg;
3743                 /*
3744                  * Start filling in records from the pending list.
3745                  */
3746                 data = bp->b_data;
3747                 off = 0;
3748
3749                 /*
3750                  * Always put a header on the first block.
3751                  * XXX As with below, there might not be a chance to get
3752                  * into the loop.  Ensure that something valid is written.
3753                  */
3754                 jseg_write(ump, jseg, data);
3755                 off += JREC_SIZE;
3756                 data = bp->b_data + off;
3757
3758                 /*
3759                  * XXX Something is wrong here.  There's no work to do,
3760                  * but we need to perform an I/O and allow it to complete
3761                  * anyways.
3762                  */
3763                 if (LIST_EMPTY(&ump->softdep_journal_pending))
3764                         stat_emptyjblocks++;
3765
                     /*
                      * cnt is the number of records this segment may still
                      * take; stop when it reaches zero or the pending list
                      * is empty.
                      */
3766                 while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3767                     != NULL) {
3768                         if (cnt == 0)
3769                                 break;
3770                         /* Place a segment header on every device block. */
3771                         if ((off % devbsize) == 0) {
3772                                 jseg_write(ump, jseg, data);
3773                                 off += JREC_SIZE;
3774                                 data = bp->b_data + off;
3775                         }
                             /* The item the caller waits for is being written. */
3776                         if (wk == needwk)
3777                                 needwk = NULL;
3778                         remove_from_journal(wk);
3779                         wk->wk_state |= INPROGRESS;
3780                         WORKLIST_INSERT(&jseg->js_entries, wk);
3781                         switch (wk->wk_type) {
3782                         case D_JADDREF:
3783                                 jaddref_write(WK_JADDREF(wk), jseg, data);
3784                                 break;
3785                         case D_JREMREF:
3786                                 jremref_write(WK_JREMREF(wk), jseg, data);
3787                                 break;
3788                         case D_JMVREF:
3789                                 jmvref_write(WK_JMVREF(wk), jseg, data);
3790                                 break;
3791                         case D_JNEWBLK:
3792                                 jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3793                                 break;
3794                         case D_JFREEBLK:
3795                                 jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3796                                 break;
3797                         case D_JFREEFRAG:
3798                                 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3799                                 break;
3800                         case D_JTRUNC:
3801                                 jtrunc_write(WK_JTRUNC(wk), jseg, data);
3802                                 break;
3803                         case D_JFSYNC:
3804                                 jfsync_write(WK_JFSYNC(wk), jseg, data);
3805                                 break;
3806                         default:
3807                                 panic("process_journal: Unknown type %s",
3808                                     TYPENAME(wk->wk_type));
3809                                 /* NOTREACHED */
3810                         }
3811                         off += JREC_SIZE;
3812                         data = bp->b_data + off;
3813                         cnt--;
3814                 }
3815
3816                 /* Clear any remaining space so we don't leak kernel data */
3817                 if (size > off)
3818                         bzero(data, size - off);
3819
3820                 /*
3821                  * Write this one buffer and continue.
3822                  */
3823                 segwritten = 1;
3824                 jblocks->jb_needseg = 0;
3825                 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3826                 FREE_LOCK(ump);
3827                 bp->b_xflags |= BX_CVTENXIO;
3828                 pbgetvp(ump->um_devvp, bp);
3829                 /*
3830                  * We only do the blocking wait once we find the journal
3831                  * entry we're looking for.
3832                  */
3833                 if (needwk == NULL && flags == MNT_WAIT)
3834                         bwrite(bp);
3835                 else
3836                         bawrite(bp);
3837                 ACQUIRE_LOCK(ump);
3838         }
3839         /*
3840          * If we wrote a segment issue a synchronize cache so the journal
3841          * is reflected on disk before the data is written.  Since reclaiming
3842          * journal space also requires writing a journal record this
3843          * process also enforces a barrier before reclamation.
3844          */
3845         if (segwritten && shouldflush) {
3846                 softdep_synchronize(bio, ump,
3847                     TAILQ_LAST(&jblocks->jb_segs, jseglst));
3848         } else if (bio)
3849                 g_destroy_bio(bio);
3850         /*
3851          * If we've suspended the filesystem because we ran out of journal
3852          * space either try to sync it here to make some progress or
3853          * unsuspend it if we already have.
3854          */
3855         if (flags == 0 && jblocks->jb_suspended) {
3856                 if (journal_unsuspend(ump))
3857                         return;
3858                 FREE_LOCK(ump);
3859                 VFS_SYNC(mp, MNT_NOWAIT);
3860                 ffs_sbupdate(ump, MNT_WAIT, 0);
3861                 ACQUIRE_LOCK(ump);
3862         }
3863 }
3864
3865 /*
3866  * Complete a jseg, allowing all dependencies awaiting journal writes
3867  * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3868  * structures so that the journal segment can be freed to reclaim space.
      *
      * Every entry on js_entries is removed, switched from INPROGRESS to
      * COMPLETE and handed to its type-specific handler.  D_JMVREF and
      * D_JFSYNC entries carry no jsegdep, so their reference on the jseg is
      * released here and the entry itself is freed.  The final rele_jseg()
      * drops the self reference.
      *
      * Note that the KASSERT and panic messages below identify themselves
      * as handle_written_jseg.
3869  */
3870 static void
3871 complete_jseg(struct jseg *jseg)
3872 {
3873         struct worklist *wk;
3874         struct jmvref *jmvref;
3875 #ifdef INVARIANTS
3876         int i = 0;
3877 #endif
3878
3879         while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3880                 WORKLIST_REMOVE(wk);
3881                 wk->wk_state &= ~INPROGRESS;
3882                 wk->wk_state |= COMPLETE;
                     /* No more than js_cnt entries may be on the segment. */
3883                 KASSERT(i++ < jseg->js_cnt,
3884                     ("handle_written_jseg: overflow %d >= %d",
3885                     i - 1, jseg->js_cnt));
3886                 switch (wk->wk_type) {
3887                 case D_JADDREF:
3888                         handle_written_jaddref(WK_JADDREF(wk));
3889                         break;
3890                 case D_JREMREF:
3891                         handle_written_jremref(WK_JREMREF(wk));
3892                         break;
3893                 case D_JMVREF:
3894                         rele_jseg(jseg);        /* No jsegdep. */
3895                         jmvref = WK_JMVREF(wk);
3896                         LIST_REMOVE(jmvref, jm_deps);
                             /* Free the pagedep too unless it is on a worklist. */
3897                         if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3898                                 free_pagedep(jmvref->jm_pagedep);
3899                         WORKITEM_FREE(jmvref, D_JMVREF);
3900                         break;
3901                 case D_JNEWBLK:
3902                         handle_written_jnewblk(WK_JNEWBLK(wk));
3903                         break;
3904                 case D_JFREEBLK:
3905                         handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3906                         break;
3907                 case D_JTRUNC:
3908                         handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3909                         break;
3910                 case D_JFSYNC:
3911                         rele_jseg(jseg);        /* No jsegdep. */
3912                         WORKITEM_FREE(wk, D_JFSYNC);
3913                         break;
3914                 case D_JFREEFRAG:
3915                         handle_written_jfreefrag(WK_JFREEFRAG(wk));
3916                         break;
3917                 default:
3918                         panic("handle_written_jseg: Unknown type %s",
3919                             TYPENAME(wk->wk_type));
3920                         /* NOTREACHED */
3921                 }
3922         }
3923         /* Release the self reference so the structure may be freed. */
3924         rele_jseg(jseg);
3925 }
3926
3927 /*
3928  * Run completion processing on every jseg that is ready for it, strictly
3929  * in journal order.  A jseg is ready once all of the ALLCOMPLETE state
3930  * bits are set.  Processing only starts at the current write segment and
      * stops at the first jseg that is not ready (or at the end of the list);
      * that position becomes the new write segment.
3931  */
3932 static void
3933 complete_jsegs(struct jseg *jseg)
3934 {
3935         struct jseg *next;
3936         struct jblocks *jb;
3937
3938         jb = jseg->js_jblocks;
3939         /*
3940          * Completions must not happen out of order.  Unless this is the
3941          * segment the journal is waiting on, leave it for a later call.
3942          */
3943         if (jb->jb_writeseg != jseg)
3944                 return;
3945         /* Consume consecutive ready jsegs, oldest first. */
3946         while (jseg != NULL &&
3947             (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3948                 next = TAILQ_NEXT(jseg, js_next);
3949                 jb->jb_oldestwrseq = jseg->js_oldseq;
3950                 complete_jseg(jseg);
3951                 jseg = next;
3952         }
3953         jb->jb_writeseg = jseg;
3954
3955         /* oldestwrseq may have advanced, so try to free jsegs now. */
3956         free_jsegs(jb);
3957 }
3958
3959 /*
3960  * The buffer bp holding jseg has been written: flag the jseg DEPCOMPLETE,
3961  * mark the buffer to be discarded and try to run the final completions.
      * Panics if the jseg has no references left.
3962  */
3963 static void
3964 handle_written_jseg(struct jseg *jseg, struct buf *bp)
3965 {
3966
3967         if (jseg->js_refs == 0)
3968                 panic("handle_written_jseg: No self-reference on %p", jseg);
3969         /*
3970          * The journal buffer is never needed again; set the flags that
3971          * get it discarded.
3972          */
3973         bp->b_flags |= B_NOCACHE | B_INVAL;
3974         pbrelvp(bp);
3975         jseg->js_state |= DEPCOMPLETE;
3976         complete_jsegs(jseg);
3977 }
3978
     /*
      * Detach and return the jsegdep held by inoref, leaving the inoref's
      * if_jsegdep pointer NULL.
      */
3979 static inline struct jsegdep *
3980 inoref_jseg(struct inoref *inoref)
3981 {
3982         struct jsegdep *taken;
3983
3984         taken = inoref->if_jsegdep;
3985         inoref->if_jsegdep = NULL;
3986
3987         return (taken);
3988 }
3989
3990 /*
3991  * Called once a jremref has made it to stable store.  The jremref is marked
3992  * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3993  * for the jremref to complete will be awoken by free_jremref.
      *
      * The jsegdep is detached from the jremref and queued on the dirrem's
      * dm_jwork list, carrying over the jremref's MKDIR_PARENT state bit.
      * The dirrem is added to the worklist once it has no jremrefs left and
      * is itself COMPLETE.  Panics if the inodedep can not be found.
3994  */
3995 static void
3996 handle_written_jremref(struct jremref *jremref)
3997 {
3998         struct inodedep *inodedep;
3999         struct jsegdep *jsegdep;
4000         struct dirrem *dirrem;
4001
4002         /* Grab the jsegdep. */
4003         jsegdep = inoref_jseg(&jremref->jr_ref);
4004         /*
4005          * Remove us from the inoref list.
4006          */
4007         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
4008             0, &inodedep) == 0)
4009                 panic("handle_written_jremref: Lost inodedep");
4010         TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
4011         /*
4012          * Complete the dirrem.
4013          */
4014         dirrem = jremref->jr_dirrem;
4015         jremref->jr_dirrem = NULL;
4016         LIST_REMOVE(jremref, jr_deps);
             /* Only the MKDIR_PARENT bit is propagated to the jsegdep. */
4017         jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
4018         jwork_insert(&dirrem->dm_jwork, jsegdep);
4019         if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
4020             (dirrem->dm_state & COMPLETE) != 0)
4021                 add_to_worklist(&dirrem->dm_list, 0);
4022         free_jremref(jremref);
4023 }
4024
4025 /*
4026  * Called once a jaddref has made it to stable store.  The dependency is
4027  * marked complete and any dependent structures are added to the inode
4028  * bufwait list to be completed as soon as it is written.  If a bitmap write
4029  * depends on this entry we move the inode into the inodedephd of the
4030  * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
      *
      * The jsegdep detached from the jaddref ends up on the da_jwork list of
      * the diradd: either the one the jaddref points at directly, or the one
      * referenced by its mkdir.  Panics if the inodedep can not be found or
      * the jaddref has no recognizable dependency.
4031  */
4032 static void
4033 handle_written_jaddref(struct jaddref *jaddref)
4034 {
4035         struct jsegdep *jsegdep;
4036         struct inodedep *inodedep;
4037         struct diradd *diradd;
4038         struct mkdir *mkdir;
4039
4040         /* Grab the jsegdep. */
4041         jsegdep = inoref_jseg(&jaddref->ja_ref);
4042         mkdir = NULL;
4043         diradd = NULL;
4044         if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4045             0, &inodedep) == 0)
4046                 panic("handle_written_jaddref: Lost inodedep.");
4047         if (jaddref->ja_diradd == NULL)
4048                 panic("handle_written_jaddref: No dependency");
             /*
              * Classify the dependency.  A diradd, or the mkdir of a
              * MKDIR_PARENT jaddref, is queued on the inodedep's bufwait
              * list; for MKDIR_BODY the mkdir is only remembered.
              */
4049         if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
4050                 diradd = jaddref->ja_diradd;
4051                 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
4052         } else if (jaddref->ja_state & MKDIR_PARENT) {
4053                 mkdir = jaddref->ja_mkdir;
4054                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
4055         } else if (jaddref->ja_state & MKDIR_BODY)
4056                 mkdir = jaddref->ja_mkdir;
4057         else
4058                 panic("handle_written_jaddref: Unknown dependency %p",
4059                     jaddref->ja_diradd);
4060         jaddref->ja_diradd = NULL;      /* also clears ja_mkdir */
4061         /*
4062          * Remove us from the inode list.
4063          */
4064         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
4065         /*
4066          * The mkdir may be waiting on the jaddref to clear before freeing.
4067          */
4068         if (mkdir) {
4069                 KASSERT(mkdir->md_list.wk_type == D_MKDIR,
4070                     ("handle_written_jaddref: Incorrect type for mkdir %s",
4071                     TYPENAME(mkdir->md_list.wk_type)));
4072                 mkdir->md_jaddref = NULL;
                     /* In the mkdir cases the diradd comes from the mkdir. */
4073                 diradd = mkdir->md_diradd;
4074                 mkdir->md_state |= DEPCOMPLETE;
4075                 complete_mkdir(mkdir);
4076         }
4077         jwork_insert(&diradd->da_jwork, jsegdep);
4078         if (jaddref->ja_state & NEWBLOCK) {
4079                 inodedep->id_state |= ONDEPLIST;
4080                 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
4081                     inodedep, id_deps);
4082         }
4083         free_jaddref(jaddref);
4084 }
4085
4086 /*
4087  * Called once a jnewblk journal is written.  The allocdirect or allocindir
4088  * is placed in the bmsafemap to await notification of a written bitmap.  If
4089  * the operation was canceled we add the segdep to the appropriate
4090  * dependency to free the journal space once the canceling operation
4091  * completes.
4092  */
4093 static void
4094 handle_written_jnewblk(struct jnewblk *jnewblk)
4095 {
4096         struct bmsafemap *bmsafemap;
4097         struct freefrag *freefrag;
4098         struct freework *freework;
4099         struct jsegdep *jsegdep;
4100         struct newblk *newblk;
4101
4102         /* Grab the jsegdep. */
4103         jsegdep = jnewblk->jn_jsegdep;
4104         jnewblk->jn_jsegdep = NULL;
4105         if (jnewblk->jn_dep == NULL)
4106                 panic("handle_written_jnewblk: No dependency for the segdep.");
4107         switch (jnewblk->jn_dep->wk_type) {
4108         case D_NEWBLK:
4109         case D_ALLOCDIRECT:
4110         case D_ALLOCINDIR:
4111                 /*
4112                  * Add the written block to the bmsafemap so it can
4113                  * be notified when the bitmap is on disk.
4114                  */
4115                 newblk = WK_NEWBLK(jnewblk->jn_dep);
4116                 newblk->nb_jnewblk = NULL;
4117                 if ((newblk->nb_state & GOINGAWAY) == 0) {
4118                         bmsafemap = newblk->nb_bmsafemap;
4119                         newblk->nb_state |= ONDEPLIST;
4120                         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
4121                             nb_deps);
4122                 }
4123                 jwork_insert(&newblk->nb_jwork, jsegdep);
4124                 break;
4125         case D_FREEFRAG:
4126                 /*
4127                  * A newblock being removed by a freefrag when replaced by
4128                  * frag extension.
4129                  */
4130                 freefrag = WK_FREEFRAG(jnewblk->jn_dep);
4131                 freefrag->ff_jdep = NULL;
4132                 jwork_insert(&freefrag->ff_jwork, jsegdep);
4133                 break;
4134         case D_FREEWORK:
4135                 /*
4136                  * A direct block was removed by truncate.
4137                  */
4138                 freework = WK_FREEWORK(jnewblk->jn_dep);
4139                 freework->fw_jnewblk = NULL;
4140                 jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
4141                 break;
4142         default:
4143                 panic("handle_written_jnewblk: Unknown type %d.",
4144                     jnewblk->jn_dep->wk_type);
4145         }
4146         jnewblk->jn_dep = NULL;
4147         free_jnewblk(jnewblk);
4148 }
4149
4150 /*
4151  * Cancel a jfreefrag that won't be needed, probably due to colliding with
4152  * an in-flight allocation that has not yet been committed.  Divorce us
4153  * from the freefrag and mark it DEPCOMPLETE so that it may be added
4154  * to the worklist.
4155  */
4156 static void
4157 cancel_jfreefrag(struct jfreefrag *jfreefrag)
4158 {
4159         struct freefrag *freefrag;
4160
4161         if (jfreefrag->fr_jsegdep) {
4162                 free_jsegdep(jfreefrag->fr_jsegdep);
4163                 jfreefrag->fr_jsegdep = NULL;
4164         }
4165         freefrag = jfreefrag->fr_freefrag;
4166         jfreefrag->fr_freefrag = NULL;
4167         free_jfreefrag(jfreefrag);
4168         freefrag->ff_state |= DEPCOMPLETE;
4169         CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
4170 }
4171
4172 /*
4173  * Free a jfreefrag when the parent freefrag is rendered obsolete.
4174  */
4175 static void
4176 free_jfreefrag(struct jfreefrag *jfreefrag)
4177 {
4178
4179         if (jfreefrag->fr_state & INPROGRESS)
4180                 WORKLIST_REMOVE(&jfreefrag->fr_list);
4181         else if (jfreefrag->fr_state & ONWORKLIST)
4182                 remove_from_journal(&jfreefrag->fr_list);
4183         if (jfreefrag->fr_freefrag != NULL)
4184                 panic("free_jfreefrag:  Still attached to a freefrag.");
4185         WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
4186 }
4187
4188 /*
4189  * Called when the journal write for a jfreefrag completes.  The parent
4190  * freefrag is added to the worklist if this completes its dependencies.
4191  */
4192 static void
4193 handle_written_jfreefrag(struct jfreefrag *jfreefrag)
4194 {
4195         struct jsegdep *jsegdep;
4196         struct freefrag *freefrag;
4197
4198         /* Grab the jsegdep. */
4199         jsegdep = jfreefrag->fr_jsegdep;
4200         jfreefrag->fr_jsegdep = NULL;
4201         freefrag = jfreefrag->fr_freefrag;
4202         if (freefrag == NULL)
4203                 panic("handle_written_jfreefrag: No freefrag.");
4204         freefrag->ff_state |= DEPCOMPLETE;
4205         freefrag->ff_jdep = NULL;
4206         jwork_insert(&freefrag->ff_jwork, jsegdep);
4207         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
4208                 add_to_worklist(&freefrag->ff_list, 0);
4209         jfreefrag->fr_freefrag = NULL;
4210         free_jfreefrag(jfreefrag);
4211 }
4212
4213 /*
4214  * Called when the journal write for a jfreeblk completes.  The jfreeblk
4215  * is removed from the freeblks list of pending journal writes and the
4216  * jsegdep is moved to the freeblks jwork to be completed when all blocks
4217  * have been reclaimed.
4218  */
4219 static void
4220 handle_written_jblkdep(struct jblkdep *jblkdep)
4221 {
4222         struct freeblks *freeblks;
4223         struct jsegdep *jsegdep;
4224
4225         /* Grab the jsegdep. */
4226         jsegdep = jblkdep->jb_jsegdep;
4227         jblkdep->jb_jsegdep = NULL;
4228         freeblks = jblkdep->jb_freeblks;
4229         LIST_REMOVE(jblkdep, jb_deps);
4230         jwork_insert(&freeblks->fb_jwork, jsegdep);
4231         /*
4232          * If the freeblks is all journaled, we can add it to the worklist.
4233          */
4234         if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
4235             (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
4236                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
4237
4238         free_jblkdep(jblkdep);
4239 }
4240
4241 static struct jsegdep *
4242 newjsegdep(struct worklist *wk)
4243 {
4244         struct jsegdep *jsegdep;
4245
4246         jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
4247         workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
4248         jsegdep->jd_seg = NULL;
4249
4250         return (jsegdep);
4251 }
4252
4253 static struct jmvref *
4254 newjmvref(struct inode *dp,
4255         ino_t ino,
4256         off_t oldoff,
4257         off_t newoff)
4258 {
4259         struct jmvref *jmvref;
4260
4261         jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
4262         workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
4263         jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
4264         jmvref->jm_parent = dp->i_number;
4265         jmvref->jm_ino = ino;
4266         jmvref->jm_oldoff = oldoff;
4267         jmvref->jm_newoff = newoff;
4268
4269         return (jmvref);
4270 }
4271
4272 /*
4273  * Allocate a new jremref that tracks the removal of ip from dp with the
4274  * directory entry offset of diroff.  Mark the entry as ATTACHED and
4275  * DEPCOMPLETE as we have all the information required for the journal write
4276  * and the directory has already been removed from the buffer.  The caller
4277  * is responsible for linking the jremref into the pagedep and adding it
4278  * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4279  * a DOTDOT addition so handle_workitem_remove() can properly assign
4280  * the jsegdep when we're done.
4281  */
4282 static struct jremref *
4283 newjremref(struct dirrem *dirrem,
4284         struct inode *dp,
4285         struct inode *ip,
4286         off_t diroff,
4287         nlink_t nlink)
4288 {
4289         struct jremref *jremref;
4290
4291         jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4292         workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
4293         jremref->jr_state = ATTACHED;
4294         newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4295            nlink, ip->i_mode);
4296         jremref->jr_dirrem = dirrem;
4297
4298         return (jremref);
4299 }
4300
4301 static inline void
4302 newinoref(struct inoref *inoref,
4303         ino_t ino,
4304         ino_t parent,
4305         off_t diroff,
4306         nlink_t nlink,
4307         uint16_t mode)
4308 {
4309
4310         inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4311         inoref->if_diroff = diroff;
4312         inoref->if_ino = ino;
4313         inoref->if_parent = parent;
4314         inoref->if_nlink = nlink;
4315         inoref->if_mode = mode;
4316 }
4317
4318 /*
4319  * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4320  * directory offset may not be known until later.  The caller is responsible
4321  * adding the entry to the journal when this information is available.  nlink
4322  * should be the link count prior to the addition and mode is only required
4323  * to have the correct FMT.
4324  */
4325 static struct jaddref *
4326 newjaddref(struct inode *dp,
4327         ino_t ino,
4328         off_t diroff,
4329         int16_t nlink,
4330         uint16_t mode)
4331 {
4332         struct jaddref *jaddref;
4333
4334         jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4335         workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
4336         jaddref->ja_state = ATTACHED;
4337         jaddref->ja_mkdir = NULL;
4338         newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4339
4340         return (jaddref);
4341 }
4342
4343 /*
4344  * Create a new free dependency for a freework.  The caller is responsible
4345  * for adjusting the reference count when it has the lock held.  The freedep
4346  * will track an outstanding bitmap write that will ultimately clear the
4347  * freework to continue.
4348  */
4349 static struct freedep *
4350 newfreedep(struct freework *freework)
4351 {
4352         struct freedep *freedep;
4353
4354         freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4355         workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4356         freedep->fd_freework = freework;
4357
4358         return (freedep);
4359 }
4360
4361 /*
4362  * Free a freedep structure once the buffer it is linked to is written.  If
4363  * this is the last reference to the freework schedule it for completion.
4364  */
4365 static void
4366 free_freedep(struct freedep *freedep)
4367 {
4368         struct freework *freework;
4369
4370         freework = freedep->fd_freework;
4371         freework->fw_freeblks->fb_cgwait--;
4372         if (--freework->fw_ref == 0)
4373                 freework_enqueue(freework);
4374         WORKITEM_FREE(freedep, D_FREEDEP);
4375 }
4376
4377 /*
4378  * Allocate a new freework structure that may be a level in an indirect
4379  * when parent is not NULL or a top level block when it is.  The top level
4380  * freework structures are allocated without the per-filesystem lock held
4381  * and before the freeblks is visible outside of softdep_setup_freeblocks().
4382  */
4383 static struct freework *
4384 newfreework(struct ufsmount *ump,
4385         struct freeblks *freeblks,
4386         struct freework *parent,
4387         ufs_lbn_t lbn,
4388         ufs2_daddr_t nb,
4389         int frags,
4390         int off,
4391         int journal)
4392 {
4393         struct freework *freework;
4394
4395         freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4396         workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4397         freework->fw_state = ATTACHED;
4398         freework->fw_jnewblk = NULL;
4399         freework->fw_freeblks = freeblks;
4400         freework->fw_parent = parent;
4401         freework->fw_lbn = lbn;
4402         freework->fw_blkno = nb;
4403         freework->fw_frags = frags;
4404         freework->fw_indir = NULL;
4405         freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 ||
4406             lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
4407         freework->fw_start = freework->fw_off = off;
4408         if (journal)
4409                 newjfreeblk(freeblks, lbn, nb, frags);
4410         if (parent == NULL) {
4411                 ACQUIRE_LOCK(ump);
4412                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4413                 freeblks->fb_ref++;
4414                 FREE_LOCK(ump);
4415         }
4416
4417         return (freework);
4418 }
4419
4420 /*
4421  * Eliminate a jfreeblk for a block that does not need journaling.
4422  */
4423 static void
4424 cancel_jfreeblk(struct freeblks *freeblks, ufs2_daddr_t blkno)
4425 {
4426         struct jfreeblk *jfreeblk;
4427         struct jblkdep *jblkdep;
4428
4429         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4430                 if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4431                         continue;
4432                 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4433                 if (jfreeblk->jf_blkno == blkno)
4434                         break;
4435         }
4436         if (jblkdep == NULL)
4437                 return;
4438         CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4439         free_jsegdep(jblkdep->jb_jsegdep);
4440         LIST_REMOVE(jblkdep, jb_deps);
4441         WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4442 }
4443
4444 /*
4445  * Allocate a new jfreeblk to journal top level block pointer when truncating
4446  * a file.  The caller must add this to the worklist when the per-filesystem
4447  * lock is held.
4448  */
4449 static struct jfreeblk *
4450 newjfreeblk(struct freeblks *freeblks,
4451         ufs_lbn_t lbn,
4452         ufs2_daddr_t blkno,
4453         int frags)
4454 {
4455         struct jfreeblk *jfreeblk;
4456
4457         jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4458         workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4459             freeblks->fb_list.wk_mp);
4460         jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4461         jfreeblk->jf_dep.jb_freeblks = freeblks;
4462         jfreeblk->jf_ino = freeblks->fb_inum;
4463         jfreeblk->jf_lbn = lbn;
4464         jfreeblk->jf_blkno = blkno;
4465         jfreeblk->jf_frags = frags;
4466         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4467
4468         return (jfreeblk);
4469 }
4470
4471 /*
4472  * The journal is only prepared to handle full-size block numbers, so we
4473  * have to adjust the record to reflect the change to a full-size block.
4474  * For example, suppose we have a block made up of fragments 8-15 and
4475  * want to free its last two fragments. We are given a request that says:
4476  *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4477  * where frags are the number of fragments to free and oldfrags are the
4478  * number of fragments to keep. To block align it, we have to change it to
4479  * have a valid full-size blkno, so it becomes:
4480  *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4481  */
4482 static void
4483 adjust_newfreework(struct freeblks *freeblks, int frag_offset)
4484 {
4485         struct jfreeblk *jfreeblk;
4486
4487         KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4488             LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4489             ("adjust_newfreework: Missing freeblks dependency"));
4490
4491         jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4492         jfreeblk->jf_blkno -= frag_offset;
4493         jfreeblk->jf_frags += frag_offset;
4494 }
4495
4496 /*
4497  * Allocate a new jtrunc to track a partial truncation.
4498  */
4499 static struct jtrunc *
4500 newjtrunc(struct freeblks *freeblks,
4501         off_t size,
4502         int extsize)
4503 {
4504         struct jtrunc *jtrunc;
4505
4506         jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4507         workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4508             freeblks->fb_list.wk_mp);
4509         jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4510         jtrunc->jt_dep.jb_freeblks = freeblks;
4511         jtrunc->jt_ino = freeblks->fb_inum;
4512         jtrunc->jt_size = size;
4513         jtrunc->jt_extsize = extsize;
4514         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4515
4516         return (jtrunc);
4517 }
4518
4519 /*
4520  * If we're canceling a new bitmap we have to search for another ref
4521  * to move into the bmsafemap dep.  This might be better expressed
4522  * with another structure.
4523  */
4524 static void
4525 move_newblock_dep(struct jaddref *jaddref, struct inodedep *inodedep)
4526 {
4527         struct inoref *inoref;
4528         struct jaddref *jaddrefn;
4529
4530         jaddrefn = NULL;
4531         for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4532             inoref = TAILQ_NEXT(inoref, if_deps)) {
4533                 if ((jaddref->ja_state & NEWBLOCK) &&
4534                     inoref->if_list.wk_type == D_JADDREF) {
4535                         jaddrefn = (struct jaddref *)inoref;
4536                         break;
4537                 }
4538         }
4539         if (jaddrefn == NULL)
4540                 return;
4541         jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4542         jaddrefn->ja_state |= jaddref->ja_state &
4543             (ATTACHED | UNDONE | NEWBLOCK);
4544         jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4545         jaddref->ja_state |= ATTACHED;
4546         LIST_REMOVE(jaddref, ja_bmdeps);
4547         LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4548             ja_bmdeps);
4549 }
4550
4551 /*
4552  * Cancel a jaddref either before it has been written or while it is being
4553  * written.  This happens when a link is removed before the add reaches
4554  * the disk.  The jaddref dependency is kept linked into the bmsafemap
4555  * and inode to prevent the link count or bitmap from reaching the disk
4556  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4557  * required.
4558  *
4559  * Returns 1 if the canceled addref requires journaling of the remove and
4560  * 0 otherwise.
4561  */
4562 static int
4563 cancel_jaddref(struct jaddref *jaddref,
4564         struct inodedep *inodedep,
4565         struct workhead *wkhd)
4566 {
4567         struct inoref *inoref;
4568         struct jsegdep *jsegdep;
4569         int needsj;
4570
4571         KASSERT((jaddref->ja_state & COMPLETE) == 0,
4572             ("cancel_jaddref: Canceling complete jaddref"));
4573         if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4574                 needsj = 1;
4575         else
4576                 needsj = 0;
4577         if (inodedep == NULL)
4578                 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4579                     0, &inodedep) == 0)
4580                         panic("cancel_jaddref: Lost inodedep");
4581         /*
4582          * We must adjust the nlink of any reference operation that follows
4583          * us so that it is consistent with the in-memory reference.  This
4584          * ensures that inode nlink rollbacks always have the correct link.
4585          */
4586         if (needsj == 0) {
4587                 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4588                     inoref = TAILQ_NEXT(inoref, if_deps)) {
4589                         if (inoref->if_state & GOINGAWAY)
4590                                 break;
4591                         inoref->if_nlink--;
4592                 }
4593         }
4594         jsegdep = inoref_jseg(&jaddref->ja_ref);
4595         if (jaddref->ja_state & NEWBLOCK)
4596                 move_newblock_dep(jaddref, inodedep);
4597         wake_worklist(&jaddref->ja_list);
4598         jaddref->ja_mkdir = NULL;
4599         if (jaddref->ja_state & INPROGRESS) {
4600                 jaddref->ja_state &= ~INPROGRESS;
4601                 WORKLIST_REMOVE(&jaddref->ja_list);
4602                 jwork_insert(wkhd, jsegdep);
4603         } else {
4604                 free_jsegdep(jsegdep);
4605                 if (jaddref->ja_state & DEPCOMPLETE)
4606                         remove_from_journal(&jaddref->ja_list);
4607         }
4608         jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4609         /*
4610          * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4611          * can arrange for them to be freed with the bitmap.  Otherwise we
4612          * no longer need this addref attached to the inoreflst and it
4613          * will incorrectly adjust nlink if we leave it.
4614          */
4615         if ((jaddref->ja_state & NEWBLOCK) == 0) {
4616                 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4617                     if_deps);
4618                 jaddref->ja_state |= COMPLETE;
4619                 free_jaddref(jaddref);
4620                 return (needsj);
4621         }
4622         /*
4623          * Leave the head of the list for jsegdeps for fast merging.
4624          */
4625         if (LIST_FIRST(wkhd) != NULL) {
4626                 jaddref->ja_state |= ONWORKLIST;
4627                 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4628         } else
4629                 WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4630
4631         return (needsj);
4632 }
4633
4634 /*
4635  * Attempt to free a jaddref structure when some work completes.  This
4636  * should only succeed once the entry is written and all dependencies have
4637  * been notified.
4638  */
4639 static void
4640 free_jaddref(struct jaddref *jaddref)
4641 {
4642
4643         if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4644                 return;
4645         if (jaddref->ja_ref.if_jsegdep)
4646                 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4647                     jaddref, jaddref->ja_state);
4648         if (jaddref->ja_state & NEWBLOCK)
4649                 LIST_REMOVE(jaddref, ja_bmdeps);
4650         if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4651                 panic("free_jaddref: Bad state %p(0x%X)",
4652                     jaddref, jaddref->ja_state);
4653         if (jaddref->ja_mkdir != NULL)
4654                 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4655         WORKITEM_FREE(jaddref, D_JADDREF);
4656 }
4657
4658 /*
4659  * Free a jremref structure once it has been written or discarded.
4660  */
4661 static void
4662 free_jremref(struct jremref *jremref)
4663 {
4664
4665         if (jremref->jr_ref.if_jsegdep)
4666                 free_jsegdep(jremref->jr_ref.if_jsegdep);
4667         if (jremref->jr_state & INPROGRESS)
4668                 panic("free_jremref: IO still pending");
4669         WORKITEM_FREE(jremref, D_JREMREF);
4670 }
4671
4672 /*
4673  * Free a jnewblk structure.
4674  */
4675 static void
4676 free_jnewblk(struct jnewblk *jnewblk)
4677 {
4678
4679         if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4680                 return;
4681         LIST_REMOVE(jnewblk, jn_deps);
4682         if (jnewblk->jn_dep != NULL)
4683                 panic("free_jnewblk: Dependency still attached.");
4684         WORKITEM_FREE(jnewblk, D_JNEWBLK);
4685 }
4686
4687 /*
4688  * Cancel a jnewblk which has been been made redundant by frag extension.
4689  */
4690 static void
4691 cancel_jnewblk(struct jnewblk *jnewblk, struct workhead *wkhd)
4692 {
4693         struct jsegdep *jsegdep;
4694
4695         CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4696         jsegdep = jnewblk->jn_jsegdep;
4697         if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4698                 panic("cancel_jnewblk: Invalid state");
4699         jnewblk->jn_jsegdep  = NULL;
4700         jnewblk->jn_dep = NULL;
4701         jnewblk->jn_state |= GOINGAWAY;
4702         if (jnewblk->jn_state & INPROGRESS) {
4703                 jnewblk->jn_state &= ~INPROGRESS;
4704                 WORKLIST_REMOVE(&jnewblk->jn_list);
4705                 jwork_insert(wkhd, jsegdep);
4706         } else {
4707                 free_jsegdep(jsegdep);
4708                 remove_from_journal(&jnewblk->jn_list);
4709         }
4710         wake_worklist(&jnewblk->jn_list);
4711         WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4712 }
4713
4714 static void
4715 free_jblkdep(struct jblkdep *jblkdep)
4716 {
4717
4718         if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4719                 WORKITEM_FREE(jblkdep, D_JFREEBLK);
4720         else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4721                 WORKITEM_FREE(jblkdep, D_JTRUNC);
4722         else
4723                 panic("free_jblkdep: Unexpected type %s",
4724                     TYPENAME(jblkdep->jb_list.wk_type));
4725 }
4726
4727 /*
4728  * Free a single jseg once it is no longer referenced in memory or on
4729  * disk.  Reclaim journal blocks and dependencies waiting for the segment
4730  * to disappear.
4731  */
4732 static void
4733 free_jseg(struct jseg *jseg, struct jblocks *jblocks)
4734 {
4735         struct freework *freework;
4736
4737         /*
4738          * Free freework structures that were lingering to indicate freed
4739          * indirect blocks that forced journal write ordering on reallocate.
4740          */
4741         while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4742                 indirblk_remove(freework);
4743         if (jblocks->jb_oldestseg == jseg)
4744                 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4745         TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4746         jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4747         KASSERT(LIST_EMPTY(&jseg->js_entries),
4748             ("free_jseg: Freed jseg has valid entries."));
4749         WORKITEM_FREE(jseg, D_JSEG);
4750 }
4751
4752 /*
4753  * Free all jsegs that meet the criteria for being reclaimed and update
4754  * oldestseg.
4755  */
4756 static void
4757 free_jsegs(struct jblocks *jblocks)
4758 {
4759         struct jseg *jseg;
4760
4761         /*
4762          * Free only those jsegs which have none allocated before them to
4763          * preserve the journal space ordering.
4764          */
4765         while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4766                 /*
4767                  * Only reclaim space when nothing depends on this journal
4768                  * set and another set has written that it is no longer
4769                  * valid.
4770                  */
4771                 if (jseg->js_refs != 0) {
4772                         jblocks->jb_oldestseg = jseg;
4773                         return;
4774                 }
4775                 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4776                         break;
4777                 if (jseg->js_seq > jblocks->jb_oldestwrseq)
4778                         break;
4779                 /*
4780                  * We can free jsegs that didn't write entries when
4781                  * oldestwrseq == js_seq.
4782                  */
4783                 if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4784                     jseg->js_cnt != 0)
4785                         break;
4786                 free_jseg(jseg, jblocks);
4787         }
4788         /*
4789          * If we exited the loop above we still must discover the
4790          * oldest valid segment.
4791          */
4792         if (jseg)
4793                 for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4794                      jseg = TAILQ_NEXT(jseg, js_next))
4795                         if (jseg->js_refs != 0)
4796                                 break;
4797         jblocks->jb_oldestseg = jseg;
4798         /*
4799          * The journal has no valid records but some jsegs may still be
4800          * waiting on oldestwrseq to advance.  We force a small record
4801          * out to permit these lingering records to be reclaimed.
4802          */
4803         if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4804                 jblocks->jb_needseg = 1;
4805 }
4806
4807 /*
4808  * Release one reference to a jseg and free it if the count reaches 0.  This
4809  * should eventually reclaim journal space as well.
4810  */
4811 static void
4812 rele_jseg(struct jseg *jseg)
4813 {
4814
4815         KASSERT(jseg->js_refs > 0,
4816             ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4817         if (--jseg->js_refs != 0)
4818                 return;
4819         free_jsegs(jseg->js_jblocks);
4820 }
4821
4822 /*
4823  * Release a jsegdep and decrement the jseg count.
4824  */
4825 static void
4826 free_jsegdep(struct jsegdep *jsegdep)
4827 {
4828
4829         if (jsegdep->jd_seg)
4830                 rele_jseg(jsegdep->jd_seg);
4831         WORKITEM_FREE(jsegdep, D_JSEGDEP);
4832 }
4833
4834 /*
4835  * Wait for a journal item to make it to disk.  Initiate journal processing
4836  * if required.
4837  */
4838 static int
4839 jwait(struct worklist *wk, int waitfor)
4840 {
4841
4842         LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4843         /*
4844          * Blocking journal waits cause slow synchronous behavior.  Record
4845          * stats on the frequency of these blocking operations.
4846          */
4847         if (waitfor == MNT_WAIT) {
4848                 stat_journal_wait++;
4849                 switch (wk->wk_type) {
4850                 case D_JREMREF:
4851                 case D_JMVREF:
4852                         stat_jwait_filepage++;
4853                         break;
4854                 case D_JTRUNC:
4855                 case D_JFREEBLK:
4856                         stat_jwait_freeblks++;
4857                         break;
4858                 case D_JNEWBLK:
4859                         stat_jwait_newblk++;
4860                         break;
4861                 case D_JADDREF:
4862                         stat_jwait_inode++;
4863                         break;
4864                 default:
4865                         break;
4866                 }
4867         }
4868         /*
4869          * If IO has not started we process the journal.  We can't mark the
4870          * worklist item as IOWAITING because we drop the lock while
4871          * processing the journal and the worklist entry may be freed after
4872          * this point.  The caller may call back in and re-issue the request.
4873          */
4874         if ((wk->wk_state & INPROGRESS) == 0) {
4875                 softdep_process_journal(wk->wk_mp, wk, waitfor);
4876                 if (waitfor != MNT_WAIT)
4877                         return (EBUSY);
4878                 return (0);
4879         }
4880         if (waitfor != MNT_WAIT)
4881                 return (EBUSY);
4882         wait_worklist(wk, "jwait");
4883         return (0);
4884 }
4885
4886 /*
4887  * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4888  * appropriate.  This is a convenience function to reduce duplicate code
4889  * for the setup and revert functions below.
4890  */
4891 static struct inodedep *
4892 inodedep_lookup_ip(struct inode *ip)
4893 {
4894         struct inodedep *inodedep;
4895
4896         KASSERT(ip->i_nlink >= ip->i_effnlink,
4897             ("inodedep_lookup_ip: bad delta"));
4898         (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
4899             &inodedep);
4900         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4901         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4902
4903         return (inodedep);
4904 }
4905
4906 /*
4907  * Called prior to creating a new inode and linking it to a directory.  The
4908  * jaddref structure must already be allocated by softdep_setup_inomapdep
4909  * and it is discovered here so we can initialize the mode and update
4910  * nlinkdelta.
4911  */
4912 void
4913 softdep_setup_create(struct inode *dp, struct inode *ip)
4914 {
4915         struct inodedep *inodedep;
4916         struct jaddref *jaddref __diagused;
4917         struct vnode *dvp;
4918
4919         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4920             ("softdep_setup_create called on non-softdep filesystem"));
4921         KASSERT(ip->i_nlink == 1,
4922             ("softdep_setup_create: Invalid link count."));
4923         dvp = ITOV(dp);
4924         ACQUIRE_LOCK(ITOUMP(dp));
4925         inodedep = inodedep_lookup_ip(ip);
4926         if (DOINGSUJ(dvp)) {
4927                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4928                     inoreflst);
4929                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4930                     ("softdep_setup_create: No addref structure present."));
4931         }
4932         FREE_LOCK(ITOUMP(dp));
4933 }
4934
4935 /*
4936  * Create a jaddref structure to track the addition of a DOTDOT link when
4937  * we are reparenting an inode as part of a rename.  This jaddref will be
4938  * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4939  * non-journaling softdep.
4940  */
4941 void
4942 softdep_setup_dotdot_link(struct inode *dp, struct inode *ip)
4943 {
4944         struct inodedep *inodedep;
4945         struct jaddref *jaddref;
4946         struct vnode *dvp;
4947
4948         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4949             ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4950         dvp = ITOV(dp);
4951         jaddref = NULL;
4952         /*
4953          * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4954          * is used as a normal link would be.
4955          */
4956         if (DOINGSUJ(dvp))
4957                 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4958                     dp->i_effnlink - 1, dp->i_mode);
4959         ACQUIRE_LOCK(ITOUMP(dp));
4960         inodedep = inodedep_lookup_ip(dp);
4961         if (jaddref)
4962                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4963                     if_deps);
4964         FREE_LOCK(ITOUMP(dp));
4965 }
4966
4967 /*
4968  * Create a jaddref structure to track a new link to an inode.  The directory
4969  * offset is not known until softdep_setup_directory_add or
4970  * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4971  * softdep.
4972  */
4973 void
4974 softdep_setup_link(struct inode *dp, struct inode *ip)
4975 {
4976         struct inodedep *inodedep;
4977         struct jaddref *jaddref;
4978         struct vnode *dvp;
4979
4980         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4981             ("softdep_setup_link called on non-softdep filesystem"));
4982         dvp = ITOV(dp);
4983         jaddref = NULL;
4984         if (DOINGSUJ(dvp))
4985                 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4986                     ip->i_mode);
4987         ACQUIRE_LOCK(ITOUMP(dp));
4988         inodedep = inodedep_lookup_ip(ip);
4989         if (jaddref)
4990                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4991                     if_deps);
4992         FREE_LOCK(ITOUMP(dp));
4993 }
4994
4995 /*
4996  * Called to create the jaddref structures to track . and .. references as
4997  * well as lookup and further initialize the incomplete jaddref created
4998  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4999  * nlinkdelta for non-journaling softdep.
5000  */
5001 void
5002 softdep_setup_mkdir(struct inode *dp, struct inode *ip)
5003 {
5004         struct inodedep *inodedep;
5005         struct jaddref *dotdotaddref;
5006         struct jaddref *dotaddref;
5007         struct jaddref *jaddref;
5008         struct vnode *dvp;
5009
5010         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5011             ("softdep_setup_mkdir called on non-softdep filesystem"));
5012         dvp = ITOV(dp);
5013         dotaddref = dotdotaddref = NULL;
5014         if (DOINGSUJ(dvp)) {
5015                 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
5016                     ip->i_mode);
5017                 dotaddref->ja_state |= MKDIR_BODY;
5018                 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
5019                     dp->i_effnlink - 1, dp->i_mode);
5020                 dotdotaddref->ja_state |= MKDIR_PARENT;
5021         }
5022         ACQUIRE_LOCK(ITOUMP(dp));
5023         inodedep = inodedep_lookup_ip(ip);
5024         if (DOINGSUJ(dvp)) {
5025                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5026                     inoreflst);
5027                 KASSERT(jaddref != NULL,
5028                     ("softdep_setup_mkdir: No addref structure present."));
5029                 KASSERT(jaddref->ja_parent == dp->i_number,
5030                     ("softdep_setup_mkdir: bad parent %ju",
5031                     (uintmax_t)jaddref->ja_parent));
5032                 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
5033                     if_deps);
5034         }
5035         inodedep = inodedep_lookup_ip(dp);
5036         if (DOINGSUJ(dvp))
5037                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
5038                     &dotdotaddref->ja_ref, if_deps);
5039         FREE_LOCK(ITOUMP(dp));
5040 }
5041
/*
 * Hook run before a directory is unlinked: refreshes the nlinkdelta of
 * the directory being removed (ip) and then of its parent (dp), both
 * under the per-mount softdep lock.
 */
void
softdep_setup_rmdir(struct inode *dp, struct inode *ip)
{
	struct ufsmount *ump;

	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
	    ("softdep_setup_rmdir called on non-softdep filesystem"));
	ump = ITOUMP(dp);
	ACQUIRE_LOCK(ump);
	/* Only the nlinkdelta side effect is wanted here. */
	(void)inodedep_lookup_ip(ip);
	(void)inodedep_lookup_ip(dp);
	FREE_LOCK(ump);
}
5057
/*
 * Hook run before an unlink: refreshes the nlinkdelta of the inode being
 * unlinked (ip) and then of its parent directory (dp), both under the
 * per-mount softdep lock.
 */
void
softdep_setup_unlink(struct inode *dp, struct inode *ip)
{
	struct ufsmount *ump;

	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
	    ("softdep_setup_unlink called on non-softdep filesystem"));
	ump = ITOUMP(dp);
	ACQUIRE_LOCK(ump);
	/* Only the nlinkdelta side effect is wanted here. */
	(void)inodedep_lookup_ip(ip);
	(void)inodedep_lookup_ip(dp);
	FREE_LOCK(ump);
}
5073
5074 /*
5075  * Called to release the journal structures created by a failed non-directory
5076  * creation.  Adjusts nlinkdelta for non-journaling softdep.
5077  */
5078 void
5079 softdep_revert_create(struct inode *dp, struct inode *ip)
5080 {
5081         struct inodedep *inodedep;
5082         struct jaddref *jaddref;
5083         struct vnode *dvp;
5084
5085         KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
5086             ("softdep_revert_create called on non-softdep filesystem"));
5087         dvp = ITOV(dp);
5088         ACQUIRE_LOCK(ITOUMP(dp));
5089         inodedep = inodedep_lookup_ip(ip);
5090         if (DOINGSUJ(dvp)) {
5091                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5092                     inoreflst);
5093                 KASSERT(jaddref->ja_parent == dp->i_number,
5094                     ("softdep_revert_create: addref parent mismatch"));
5095                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5096         }
5097         FREE_LOCK(ITOUMP(dp));
5098 }
5099
5100 /*
5101  * Called to release the journal structures created by a failed link
5102  * addition.  Adjusts nlinkdelta for non-journaling softdep.
5103  */
5104 void
5105 softdep_revert_link(struct inode *dp, struct inode *ip)
5106 {
5107         struct inodedep *inodedep;
5108         struct jaddref *jaddref;
5109         struct vnode *dvp;
5110
5111         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5112             ("softdep_revert_link called on non-softdep filesystem"));
5113         dvp = ITOV(dp);
5114         ACQUIRE_LOCK(ITOUMP(dp));
5115         inodedep = inodedep_lookup_ip(ip);
5116         if (DOINGSUJ(dvp)) {
5117                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5118                     inoreflst);
5119                 KASSERT(jaddref->ja_parent == dp->i_number,
5120                     ("softdep_revert_link: addref parent mismatch"));
5121                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5122         }
5123         FREE_LOCK(ITOUMP(dp));
5124 }
5125
5126 /*
5127  * Called to release the journal structures created by a failed mkdir
5128  * attempt.  Adjusts nlinkdelta for non-journaling softdep.
5129  */
5130 void
5131 softdep_revert_mkdir(struct inode *dp, struct inode *ip)
5132 {
5133         struct inodedep *inodedep;
5134         struct jaddref *jaddref;
5135         struct jaddref *dotaddref;
5136         struct vnode *dvp;
5137
5138         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5139             ("softdep_revert_mkdir called on non-softdep filesystem"));
5140         dvp = ITOV(dp);
5141
5142         ACQUIRE_LOCK(ITOUMP(dp));
5143         inodedep = inodedep_lookup_ip(dp);
5144         if (DOINGSUJ(dvp)) {
5145                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5146                     inoreflst);
5147                 KASSERT(jaddref->ja_parent == ip->i_number,
5148                     ("softdep_revert_mkdir: dotdot addref parent mismatch"));
5149                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5150         }
5151         inodedep = inodedep_lookup_ip(ip);
5152         if (DOINGSUJ(dvp)) {
5153                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5154                     inoreflst);
5155                 KASSERT(jaddref->ja_parent == dp->i_number,
5156                     ("softdep_revert_mkdir: addref parent mismatch"));
5157                 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
5158                     inoreflst, if_deps);
5159                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5160                 KASSERT(dotaddref->ja_parent == ip->i_number,
5161                     ("softdep_revert_mkdir: dot addref parent mismatch"));
5162                 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
5163         }
5164         FREE_LOCK(ITOUMP(dp));
5165 }
5166
/*
 * Hook run after a failed rmdir: recomputes the nlinkdelta of the
 * directory (ip) and then of its parent (dp), both under the per-mount
 * softdep lock.
 */
void
softdep_revert_rmdir(struct inode *dp, struct inode *ip)
{
	struct ufsmount *ump;

	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
	    ("softdep_revert_rmdir called on non-softdep filesystem"));
	ump = ITOUMP(dp);
	ACQUIRE_LOCK(ump);
	/* Only the nlinkdelta side effect is wanted here. */
	(void)inodedep_lookup_ip(ip);
	(void)inodedep_lookup_ip(dp);
	FREE_LOCK(ump);
}
5181
5182 /*
5183  * Protecting the freemaps (or bitmaps).
5184  *
5185  * To eliminate the need to execute fsck before mounting a filesystem
5186  * after a power failure, one must (conservatively) guarantee that the
5187  * on-disk copy of the bitmaps never indicates that a live inode or block is
5188  * free.  So, when a block or inode is allocated, the bitmap should be
5189  * updated (on disk) before any new pointers.  When a block or inode is
5190  * freed, the bitmap should not be updated until all pointers have been
5191  * reset.  The latter dependency is handled by the delayed de-allocation
5192  * approach described below for block and inode de-allocation.  The former
5193  * dependency is handled by calling the following procedure when a block or
5194  * inode is allocated. When an inode is allocated an "inodedep" is created
5195  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
5196  * Each "inodedep" is also inserted into the hash indexing structure so
5197  * that any additional link additions can be made dependent on the inode
5198  * allocation.
5199  *
5200  * The ufs filesystem maintains a number of free block counts (e.g., per
5201  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
5202  * in addition to the bitmaps.  These counts are used to improve efficiency
5203  * during allocation and therefore must be consistent with the bitmaps.
5204  * There is no convenient way to guarantee post-crash consistency of these
5205  * counts with simple update ordering, for two main reasons: (1) The counts
5206  * and bitmaps for a single cylinder group block are not in the same disk
5207  * sector.  If a disk write is interrupted (e.g., by power failure), one may
5208  * be written and the other not.  (2) Some of the counts are located in the
5209  * superblock rather than the cylinder group block. So, we focus our soft
5210  * updates implementation on protecting the bitmaps. When mounting a
5211  * filesystem, we recompute the auxiliary counts from the bitmaps.
5212  */
5213
5214 /*
5215  * Called just after updating the cylinder group block to allocate an inode.
5216  */
5217 void
5218 softdep_setup_inomapdep(
5219         struct buf *bp,         /* buffer for cylgroup block with inode map */
5220         struct inode *ip,       /* inode related to allocation */
5221         ino_t newinum,          /* new inode number being allocated */
5222         int mode)
5223 {
5224         struct inodedep *inodedep;
5225         struct bmsafemap *bmsafemap;
5226         struct jaddref *jaddref;
5227         struct mount *mp;
5228         struct fs *fs;
5229
5230         mp = ITOVFS(ip);
5231         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5232             ("softdep_setup_inomapdep called on non-softdep filesystem"));
5233         fs = VFSTOUFS(mp)->um_fs;
5234         jaddref = NULL;
5235
5236         /*
5237          * Allocate the journal reference add structure so that the bitmap
5238          * can be dependent on it.
5239          */
5240         if (MOUNTEDSUJ(mp)) {
5241                 jaddref = newjaddref(ip, newinum, 0, 0, mode);
5242                 jaddref->ja_state |= NEWBLOCK;
5243         }
5244
5245         /*
5246          * Create a dependency for the newly allocated inode.
5247          * Panic if it already exists as something is seriously wrong.
5248          * Otherwise add it to the dependency list for the buffer holding
5249          * the cylinder group map from which it was allocated.
5250          *
5251          * We have to preallocate a bmsafemap entry in case it is needed
5252          * in bmsafemap_lookup since once we allocate the inodedep, we
5253          * have to finish initializing it before we can FREE_LOCK().
5254          * By preallocating, we avoid FREE_LOCK() while doing a malloc
5255          * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5256          * creating the inodedep as it can be freed during the time
5257          * that we FREE_LOCK() while allocating the inodedep. We must
5258          * call workitem_alloc() before entering the locked section as
5259          * it also acquires the lock and we must avoid trying doing so
5260          * recursively.
5261          */
5262         bmsafemap = malloc(sizeof(struct bmsafemap),
5263             M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5264         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5265         ACQUIRE_LOCK(ITOUMP(ip));
5266         if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5267                 panic("softdep_setup_inomapdep: dependency %p for new"
5268                     "inode already exists", inodedep);
5269         bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5270         if (jaddref) {
5271                 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5272                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5273                     if_deps);
5274         } else {
5275                 inodedep->id_state |= ONDEPLIST;
5276                 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5277         }
5278         inodedep->id_bmsafemap = bmsafemap;
5279         inodedep->id_state &= ~DEPCOMPLETE;
5280         FREE_LOCK(ITOUMP(ip));
5281 }
5282
5283 /*
5284  * Called just after updating the cylinder group block to
5285  * allocate block or fragment.
5286  */
5287 void
5288 softdep_setup_blkmapdep(
5289         struct buf *bp,         /* buffer for cylgroup block with block map */
5290         struct mount *mp,       /* filesystem doing allocation */
5291         ufs2_daddr_t newblkno,  /* number of newly allocated block */
5292         int frags,              /* Number of fragments. */
5293         int oldfrags)           /* Previous number of fragments for extend. */
5294 {
5295         struct newblk *newblk;
5296         struct bmsafemap *bmsafemap;
5297         struct jnewblk *jnewblk;
5298         struct ufsmount *ump;
5299         struct fs *fs;
5300
5301         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5302             ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5303         ump = VFSTOUFS(mp);
5304         fs = ump->um_fs;
5305         jnewblk = NULL;
5306         /*
5307          * Create a dependency for the newly allocated block.
5308          * Add it to the dependency list for the buffer holding
5309          * the cylinder group map from which it was allocated.
5310          */
5311         if (MOUNTEDSUJ(mp)) {
5312                 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5313                 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5314                 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5315                 jnewblk->jn_state = ATTACHED;
5316                 jnewblk->jn_blkno = newblkno;
5317                 jnewblk->jn_frags = frags;
5318                 jnewblk->jn_oldfrags = oldfrags;
5319 #ifdef INVARIANTS
5320                 {
5321                         struct cg *cgp;
5322                         uint8_t *blksfree;
5323                         long bno;
5324                         int i;
5325
5326                         cgp = (struct cg *)bp->b_data;
5327                         blksfree = cg_blksfree(cgp);
5328                         bno = dtogd(fs, jnewblk->jn_blkno);
5329                         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5330                             i++) {
5331                                 if (isset(blksfree, bno + i))
5332                                         panic("softdep_setup_blkmapdep: "
5333                                             "free fragment %d from %d-%d "
5334                                             "state 0x%X dep %p", i,
5335                                             jnewblk->jn_oldfrags,
5336                                             jnewblk->jn_frags,
5337                                             jnewblk->jn_state,
5338                                             jnewblk->jn_dep);
5339                         }
5340                 }
5341 #endif
5342         }
5343
5344         CTR3(KTR_SUJ,
5345             "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5346             newblkno, frags, oldfrags);
5347         ACQUIRE_LOCK(ump);
5348         if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5349                 panic("softdep_setup_blkmapdep: found block");
5350         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5351             dtog(fs, newblkno), NULL);
5352         if (jnewblk) {
5353                 jnewblk->jn_dep = (struct worklist *)newblk;
5354                 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5355         } else {
5356                 newblk->nb_state |= ONDEPLIST;
5357                 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5358         }
5359         newblk->nb_bmsafemap = bmsafemap;
5360         newblk->nb_jnewblk = jnewblk;
5361         FREE_LOCK(ump);
5362 }
5363
/*
 * Yield a pointer to the bmsafemap hash chain for cylinder group cg.
 * The group number is reduced with a bitwise AND against
 * bmsafemap_hash_size, i.e. that field is used as a mask.
 */
#define	BMSAFEMAP_HASH(ump, cg)						\
	((ump)->bmsafemap_hashtbl + ((cg) & (ump)->bmsafemap_hash_size))
5366
5367 static int
5368 bmsafemap_find(
5369         struct bmsafemap_hashhead *bmsafemaphd,
5370         int cg,
5371         struct bmsafemap **bmsafemapp)
5372 {
5373         struct bmsafemap *bmsafemap;
5374
5375         LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5376                 if (bmsafemap->sm_cg == cg)
5377                         break;
5378         if (bmsafemap) {
5379                 *bmsafemapp = bmsafemap;
5380                 return (1);
5381         }
5382         *bmsafemapp = NULL;
5383
5384         return (0);
5385 }
5386
5387 /*
5388  * Find the bmsafemap associated with a cylinder group buffer.
5389  * If none exists, create one. The buffer must be locked when
5390  * this routine is called and this routine must be called with
5391  * the softdep lock held. To avoid giving up the lock while
5392  * allocating a new bmsafemap, a preallocated bmsafemap may be
5393  * provided. If it is provided but not needed, it is freed.
5394  */
5395 static struct bmsafemap *
5396 bmsafemap_lookup(struct mount *mp,
5397         struct buf *bp,
5398         int cg,
5399         struct bmsafemap *newbmsafemap)
5400 {
5401         struct bmsafemap_hashhead *bmsafemaphd;
5402         struct bmsafemap *bmsafemap, *collision;
5403         struct worklist *wk;
5404         struct ufsmount *ump;
5405
5406         ump = VFSTOUFS(mp);
5407         LOCK_OWNED(ump);
5408         KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5409         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5410                 if (wk->wk_type == D_BMSAFEMAP) {
5411                         if (newbmsafemap)
5412                                 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5413                         return (WK_BMSAFEMAP(wk));
5414                 }
5415         }
5416         bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5417         if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5418                 if (newbmsafemap)
5419                         WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5420                 return (bmsafemap);
5421         }
5422         if (newbmsafemap) {
5423                 bmsafemap = newbmsafemap;
5424         } else {
5425                 FREE_LOCK(ump);
5426                 bmsafemap = malloc(sizeof(struct bmsafemap),
5427                         M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5428                 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5429                 ACQUIRE_LOCK(ump);
5430         }
5431         bmsafemap->sm_buf = bp;
5432         LIST_INIT(&bmsafemap->sm_inodedephd);
5433         LIST_INIT(&bmsafemap->sm_inodedepwr);
5434         LIST_INIT(&bmsafemap->sm_newblkhd);
5435         LIST_INIT(&bmsafemap->sm_newblkwr);
5436         LIST_INIT(&bmsafemap->sm_jaddrefhd);
5437         LIST_INIT(&bmsafemap->sm_jnewblkhd);
5438         LIST_INIT(&bmsafemap->sm_freehd);
5439         LIST_INIT(&bmsafemap->sm_freewr);
5440         if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5441                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5442                 return (collision);
5443         }
5444         bmsafemap->sm_cg = cg;
5445         LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5446         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5447         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5448         return (bmsafemap);
5449 }
5450
5451 /*
5452  * Direct block allocation dependencies.
5453  *
5454  * When a new block is allocated, the corresponding disk locations must be
5455  * initialized (with zeros or new data) before the on-disk inode points to
5456  * them.  Also, the freemap from which the block was allocated must be
5457  * updated (on disk) before the inode's pointer. These two dependencies are
5458  * independent of each other and are needed for all file blocks and indirect
5459  * blocks that are pointed to directly by the inode.  Just before the
5460  * "in-core" version of the inode is updated with a newly allocated block
5461  * number, a procedure (below) is called to setup allocation dependency
5462  * structures.  These structures are removed when the corresponding
5463  * dependencies are satisfied or when the block allocation becomes obsolete
5464  * (i.e., the file is deleted, the block is de-allocated, or the block is a
5465  * fragment that gets upgraded).  All of these cases are handled in
5466  * procedures described later.
5467  *
5468  * When a file extension causes a fragment to be upgraded, either to a larger
5469  * fragment or to a full block, the on-disk location may change (if the
5470  * previous fragment could not simply be extended). In this case, the old
5471  * fragment must be de-allocated, but not until after the inode's pointer has
5472  * been updated. In most cases, this is handled by later procedures, which
5473  * will construct a "freefrag" structure to be added to the workitem queue
5474  * when the inode update is complete (or obsolete).  The main exception to
5475  * this is when an allocation occurs while a pending allocation dependency
5476  * (for the same block pointer) remains.  This case is handled in the main
5477  * allocation dependency setup procedure by immediately freeing the
5478  * unreferenced fragments.
5479  */
5480 void
5481 softdep_setup_allocdirect(
5482         struct inode *ip,       /* inode to which block is being added */
5483         ufs_lbn_t off,          /* block pointer within inode */
5484         ufs2_daddr_t newblkno,  /* disk block number being added */
5485         ufs2_daddr_t oldblkno,  /* previous block number, 0 unless frag */
5486         long newsize,           /* size of new block */
5487         long oldsize,           /* size of new block */
5488         struct buf *bp)         /* bp for allocated block */
5489 {
5490         struct allocdirect *adp, *oldadp;
5491         struct allocdirectlst *adphead;
5492         struct freefrag *freefrag;
5493         struct inodedep *inodedep;
5494         struct pagedep *pagedep;
5495         struct jnewblk *jnewblk;
5496         struct newblk *newblk;
5497         struct mount *mp;
5498         ufs_lbn_t lbn;
5499
5500         lbn = bp->b_lblkno;
5501         mp = ITOVFS(ip);
5502         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5503             ("softdep_setup_allocdirect called on non-softdep filesystem"));
5504         if (oldblkno && oldblkno != newblkno)
5505                 /*
5506                  * The usual case is that a smaller fragment that
5507                  * was just allocated has been replaced with a bigger
5508                  * fragment or a full-size block. If it is marked as
5509                  * B_DELWRI, the current contents have not been written
5510                  * to disk. It is possible that the block was written
5511                  * earlier, but very uncommon. If the block has never
5512                  * been written, there is no need to send a BIO_DELETE
5513                  * for it when it is freed. The gain from avoiding the
5514                  * TRIMs for the common case of unwritten blocks far
5515                  * exceeds the cost of the write amplification for the
5516                  * uncommon case of failing to send a TRIM for a block
5517                  * that had been written.
5518                  */
5519                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5520                     (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5521         else
5522                 freefrag = NULL;
5523
5524         CTR6(KTR_SUJ,
5525             "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5526             "off %jd newsize %ld oldsize %d",
5527             ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5528         ACQUIRE_LOCK(ITOUMP(ip));
5529         if (off >= UFS_NDADDR) {
5530                 if (lbn > 0)
5531                         panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5532                             lbn, off);
5533                 /* allocating an indirect block */
5534                 if (oldblkno != 0)
5535                         panic("softdep_setup_allocdirect: non-zero indir");
5536         } else {
5537                 if (off != lbn)
5538                         panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5539                             lbn, off);
5540                 /*
5541                  * Allocating a direct block.
5542                  *
5543                  * If we are allocating a directory block, then we must
5544                  * allocate an associated pagedep to track additions and
5545                  * deletions.
5546                  */
5547                 if ((ip->i_mode & IFMT) == IFDIR)
5548                         pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5549                             &pagedep);
5550         }
5551         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5552                 panic("softdep_setup_allocdirect: lost block");
5553         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5554             ("softdep_setup_allocdirect: newblk already initialized"));
5555         /*
5556          * Convert the newblk to an allocdirect.
5557          */
5558         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5559         adp = (struct allocdirect *)newblk;
5560         newblk->nb_freefrag = freefrag;
5561         adp->ad_offset = off;
5562         adp->ad_oldblkno = oldblkno;
5563         adp->ad_newsize = newsize;
5564         adp->ad_oldsize = oldsize;
5565
5566         /*
5567          * Finish initializing the journal.
5568          */
5569         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5570                 jnewblk->jn_ino = ip->i_number;
5571                 jnewblk->jn_lbn = lbn;
5572                 add_to_journal(&jnewblk->jn_list);
5573         }
5574         if (freefrag && freefrag->ff_jdep != NULL &&
5575             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5576                 add_to_journal(freefrag->ff_jdep);
5577         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5578         adp->ad_inodedep = inodedep;
5579
5580         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5581         /*
5582          * The list of allocdirects must be kept in sorted and ascending
5583          * order so that the rollback routines can quickly determine the
5584          * first uncommitted block (the size of the file stored on disk
5585          * ends at the end of the lowest committed fragment, or if there
5586          * are no fragments, at the end of the highest committed block).
5587          * Since files generally grow, the typical case is that the new
5588          * block is to be added at the end of the list. We speed this
5589          * special case by checking against the last allocdirect in the
5590          * list before laboriously traversing the list looking for the
5591          * insertion point.
5592          */
5593         adphead = &inodedep->id_newinoupdt;
5594         oldadp = TAILQ_LAST(adphead, allocdirectlst);
5595         if (oldadp == NULL || oldadp->ad_offset <= off) {
5596                 /* insert at end of list */
5597                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5598                 if (oldadp != NULL && oldadp->ad_offset == off)
5599                         allocdirect_merge(adphead, adp, oldadp);
5600                 FREE_LOCK(ITOUMP(ip));
5601                 return;
5602         }
5603         TAILQ_FOREACH(oldadp, adphead, ad_next) {
5604                 if (oldadp->ad_offset >= off)
5605                         break;
5606         }
5607         if (oldadp == NULL)
5608                 panic("softdep_setup_allocdirect: lost entry");
5609         /* insert in middle of list */
5610         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5611         if (oldadp->ad_offset == off)
5612                 allocdirect_merge(adphead, adp, oldadp);
5613
5614         FREE_LOCK(ITOUMP(ip));
5615 }
5616
5617 /*
5618  * Merge a newer and older journal record to be stored either in a
5619  * newblock or freefrag.  This handles aggregating journal records for
5620  * fragment allocation into a second record as well as replacing a
5621  * journal free with an aborted journal allocation.  A segment for the
5622  * oldest record will be placed on wkhd if it has been written.  If not
5623  * the segment for the newer record will suffice.
5624  */
5625 static struct worklist *
5626 jnewblk_merge(struct worklist *new,
5627         struct worklist *old,
5628         struct workhead *wkhd)
5629 {
5630         struct jnewblk *njnewblk;
5631         struct jnewblk *jnewblk;
5632
5633         /* Handle NULLs to simplify callers. */
5634         if (new == NULL)
5635                 return (old);
5636         if (old == NULL)
5637                 return (new);
5638         /* Replace a jfreefrag with a jnewblk. */
5639         if (new->wk_type == D_JFREEFRAG) {
5640                 if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5641                         panic("jnewblk_merge: blkno mismatch: %p, %p",
5642                             old, new);
5643                 cancel_jfreefrag(WK_JFREEFRAG(new));
5644                 return (old);
5645         }
5646         if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5647                 panic("jnewblk_merge: Bad type: old %d new %d\n",
5648                     old->wk_type, new->wk_type);
5649         /*
5650          * Handle merging of two jnewblk records that describe
5651          * different sets of fragments in the same block.
5652          */
5653         jnewblk = WK_JNEWBLK(old);
5654         njnewblk = WK_JNEWBLK(new);
5655         if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5656                 panic("jnewblk_merge: Merging disparate blocks.");
5657         /*
5658          * The record may be rolled back in the cg.
5659          */
5660         if (jnewblk->jn_state & UNDONE) {
5661                 jnewblk->jn_state &= ~UNDONE;
5662                 njnewblk->jn_state |= UNDONE;
5663                 njnewblk->jn_state &= ~ATTACHED;
5664         }
5665         /*
5666          * We modify the newer addref and free the older so that if neither
5667          * has been written the most up-to-date copy will be on disk.  If
5668          * both have been written but rolled back we only temporarily need
5669          * one of them to fix the bits when the cg write completes.
5670          */
5671         jnewblk->jn_state |= ATTACHED | COMPLETE;
5672         njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5673         cancel_jnewblk(jnewblk, wkhd);
5674         WORKLIST_REMOVE(&jnewblk->jn_list);
5675         free_jnewblk(jnewblk);
5676         return (new);
5677 }
5678
5679 /*
5680  * Replace an old allocdirect dependency with a newer one.
5681  */
5682 static void
5683 allocdirect_merge(
5684         struct allocdirectlst *adphead, /* head of list holding allocdirects */
5685         struct allocdirect *newadp,     /* allocdirect being added */
5686         struct allocdirect *oldadp)     /* existing allocdirect being checked */
5687 {
5688         struct worklist *wk;
5689         struct freefrag *freefrag;
5690
5691         freefrag = NULL;
5692         LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5693         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5694             newadp->ad_oldsize != oldadp->ad_newsize ||
5695             newadp->ad_offset >= UFS_NDADDR)
5696                 panic("%s %jd != new %jd || old size %ld != new %ld",
5697                     "allocdirect_merge: old blkno",
5698                     (intmax_t)newadp->ad_oldblkno,
5699                     (intmax_t)oldadp->ad_newblkno,
5700                     newadp->ad_oldsize, oldadp->ad_newsize);
5701         newadp->ad_oldblkno = oldadp->ad_oldblkno;
5702         newadp->ad_oldsize = oldadp->ad_oldsize;
5703         /*
5704          * If the old dependency had a fragment to free or had never
5705          * previously had a block allocated, then the new dependency
5706          * can immediately post its freefrag and adopt the old freefrag.
5707          * This action is done by swapping the freefrag dependencies.
5708          * The new dependency gains the old one's freefrag, and the
5709          * old one gets the new one and then immediately puts it on
5710          * the worklist when it is freed by free_newblk. It is
5711          * not possible to do this swap when the old dependency had a
5712          * non-zero size but no previous fragment to free. This condition
5713          * arises when the new block is an extension of the old block.
5714          * Here, the first part of the fragment allocated to the new
5715          * dependency is part of the block currently claimed on disk by
5716          * the old dependency, so cannot legitimately be freed until the
5717          * conditions for the new dependency are fulfilled.
5718          */
5719         freefrag = newadp->ad_freefrag;
5720         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5721                 newadp->ad_freefrag = oldadp->ad_freefrag;
5722                 oldadp->ad_freefrag = freefrag;
5723         }
5724         /*
5725          * If we are tracking a new directory-block allocation,
5726          * move it from the old allocdirect to the new allocdirect.
5727          */
5728         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5729                 WORKLIST_REMOVE(wk);
5730                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5731                         panic("allocdirect_merge: extra newdirblk");
5732                 WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5733         }
5734         TAILQ_REMOVE(adphead, oldadp, ad_next);
5735         /*
5736          * We need to move any journal dependencies over to the freefrag
5737          * that releases this block if it exists.  Otherwise we are
5738          * extending an existing block and we'll wait until that is
5739          * complete to release the journal space and extend the
5740          * new journal to cover this old space as well.
5741          */
5742         if (freefrag == NULL) {
5743                 if (oldadp->ad_newblkno != newadp->ad_newblkno)
5744                         panic("allocdirect_merge: %jd != %jd",
5745                             oldadp->ad_newblkno, newadp->ad_newblkno);
5746                 newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5747                     jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5748                     &oldadp->ad_block.nb_jnewblk->jn_list,
5749                     &newadp->ad_block.nb_jwork);
5750                 oldadp->ad_block.nb_jnewblk = NULL;
5751                 cancel_newblk(&oldadp->ad_block, NULL,
5752                     &newadp->ad_block.nb_jwork);
5753         } else {
5754                 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5755                     &freefrag->ff_list, &freefrag->ff_jwork);
5756                 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5757                     &freefrag->ff_jwork);
5758         }
5759         free_newblk(&oldadp->ad_block);
5760 }
5761
5762 /*
5763  * Allocate a jfreefrag structure to journal a single block free.
5764  */
5765 static struct jfreefrag *
5766 newjfreefrag(struct freefrag *freefrag,
5767         struct inode *ip,
5768         ufs2_daddr_t blkno,
5769         long size,
5770         ufs_lbn_t lbn)
5771 {
5772         struct jfreefrag *jfreefrag;
5773         struct fs *fs;
5774
5775         fs = ITOFS(ip);
5776         jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5777             M_SOFTDEP_FLAGS);
5778         workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
5779         jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5780         jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5781         jfreefrag->fr_ino = ip->i_number;
5782         jfreefrag->fr_lbn = lbn;
5783         jfreefrag->fr_blkno = blkno;
5784         jfreefrag->fr_frags = numfrags(fs, size);
5785         jfreefrag->fr_freefrag = freefrag;
5786
5787         return (jfreefrag);
5788 }
5789
5790 /*
5791  * Allocate a new freefrag structure.
5792  */
5793 static struct freefrag *
5794 newfreefrag(struct inode *ip,
5795         ufs2_daddr_t blkno,
5796         long size,
5797         ufs_lbn_t lbn,
5798         uint64_t key)
5799 {
5800         struct freefrag *freefrag;
5801         struct ufsmount *ump;
5802         struct fs *fs;
5803
5804         CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5805             ip->i_number, blkno, size, lbn);
5806         ump = ITOUMP(ip);
5807         fs = ump->um_fs;
5808         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5809                 panic("newfreefrag: frag size");
5810         freefrag = malloc(sizeof(struct freefrag),
5811             M_FREEFRAG, M_SOFTDEP_FLAGS);
5812         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
5813         freefrag->ff_state = ATTACHED;
5814         LIST_INIT(&freefrag->ff_jwork);
5815         freefrag->ff_inum = ip->i_number;
5816         freefrag->ff_vtype = ITOV(ip)->v_type;
5817         freefrag->ff_blkno = blkno;
5818         freefrag->ff_fragsize = size;
5819         freefrag->ff_key = key;
5820
5821         if (MOUNTEDSUJ(UFSTOVFS(ump))) {
5822                 freefrag->ff_jdep = (struct worklist *)
5823                     newjfreefrag(freefrag, ip, blkno, size, lbn);
5824         } else {
5825                 freefrag->ff_state |= DEPCOMPLETE;
5826                 freefrag->ff_jdep = NULL;
5827         }
5828
5829         return (freefrag);
5830 }
5831
5832 /*
5833  * This workitem de-allocates fragments that were replaced during
5834  * file block allocation.
5835  */
5836 static void
5837 handle_workitem_freefrag(struct freefrag *freefrag)
5838 {
5839         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5840         struct workhead wkhd;
5841
5842         CTR3(KTR_SUJ,
5843             "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5844             freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5845         /*
5846          * It would be illegal to add new completion items to the
5847          * freefrag after it was schedule to be done so it must be
5848          * safe to modify the list head here.
5849          */
5850         LIST_INIT(&wkhd);
5851         ACQUIRE_LOCK(ump);
5852         LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5853         /*
5854          * If the journal has not been written we must cancel it here.
5855          */
5856         if (freefrag->ff_jdep) {
5857                 if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5858                         panic("handle_workitem_freefrag: Unexpected type %d\n",
5859                             freefrag->ff_jdep->wk_type);
5860                 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5861         }
5862         FREE_LOCK(ump);
5863         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5864            freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
5865            &wkhd, freefrag->ff_key);
5866         ACQUIRE_LOCK(ump);
5867         WORKITEM_FREE(freefrag, D_FREEFRAG);
5868         FREE_LOCK(ump);
5869 }
5870
5871 /*
5872  * Set up a dependency structure for an external attributes data block.
5873  * This routine follows much of the structure of softdep_setup_allocdirect.
5874  * See the description of softdep_setup_allocdirect above for details.
5875  */
5876 void
5877 softdep_setup_allocext(
5878         struct inode *ip,
5879         ufs_lbn_t off,
5880         ufs2_daddr_t newblkno,
5881         ufs2_daddr_t oldblkno,
5882         long newsize,
5883         long oldsize,
5884         struct buf *bp)
5885 {
5886         struct allocdirect *adp, *oldadp;
5887         struct allocdirectlst *adphead;
5888         struct freefrag *freefrag;
5889         struct inodedep *inodedep;
5890         struct jnewblk *jnewblk;
5891         struct newblk *newblk;
5892         struct mount *mp;
5893         struct ufsmount *ump;
5894         ufs_lbn_t lbn;
5895
5896         mp = ITOVFS(ip);
5897         ump = VFSTOUFS(mp);
5898         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5899             ("softdep_setup_allocext called on non-softdep filesystem"));
5900         KASSERT(off < UFS_NXADDR,
5901             ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off));
5902
5903         lbn = bp->b_lblkno;
5904         if (oldblkno && oldblkno != newblkno)
5905                 /*
5906                  * The usual case is that a smaller fragment that
5907                  * was just allocated has been replaced with a bigger
5908                  * fragment or a full-size block. If it is marked as
5909                  * B_DELWRI, the current contents have not been written
5910                  * to disk. It is possible that the block was written
5911                  * earlier, but very uncommon. If the block has never
5912                  * been written, there is no need to send a BIO_DELETE
5913                  * for it when it is freed. The gain from avoiding the
5914                  * TRIMs for the common case of unwritten blocks far
5915                  * exceeds the cost of the write amplification for the
5916                  * uncommon case of failing to send a TRIM for a block
5917                  * that had been written.
5918                  */
5919                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5920                     (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5921         else
5922                 freefrag = NULL;
5923
5924         ACQUIRE_LOCK(ump);
5925         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5926                 panic("softdep_setup_allocext: lost block");
5927         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5928             ("softdep_setup_allocext: newblk already initialized"));
5929         /*
5930          * Convert the newblk to an allocdirect.
5931          */
5932         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5933         adp = (struct allocdirect *)newblk;
5934         newblk->nb_freefrag = freefrag;
5935         adp->ad_offset = off;
5936         adp->ad_oldblkno = oldblkno;
5937         adp->ad_newsize = newsize;
5938         adp->ad_oldsize = oldsize;
5939         adp->ad_state |=  EXTDATA;
5940
5941         /*
5942          * Finish initializing the journal.
5943          */
5944         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5945                 jnewblk->jn_ino = ip->i_number;
5946                 jnewblk->jn_lbn = lbn;
5947                 add_to_journal(&jnewblk->jn_list);
5948         }
5949         if (freefrag && freefrag->ff_jdep != NULL &&
5950             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5951                 add_to_journal(freefrag->ff_jdep);
5952         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5953         adp->ad_inodedep = inodedep;
5954
5955         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5956         /*
5957          * The list of allocdirects must be kept in sorted and ascending
5958          * order so that the rollback routines can quickly determine the
5959          * first uncommitted block (the size of the file stored on disk
5960          * ends at the end of the lowest committed fragment, or if there
5961          * are no fragments, at the end of the highest committed block).
5962          * Since files generally grow, the typical case is that the new
5963          * block is to be added at the end of the list. We speed this
5964          * special case by checking against the last allocdirect in the
5965          * list before laboriously traversing the list looking for the
5966          * insertion point.
5967          */
5968         adphead = &inodedep->id_newextupdt;
5969         oldadp = TAILQ_LAST(adphead, allocdirectlst);
5970         if (oldadp == NULL || oldadp->ad_offset <= off) {
5971                 /* insert at end of list */
5972                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5973                 if (oldadp != NULL && oldadp->ad_offset == off)
5974                         allocdirect_merge(adphead, adp, oldadp);
5975                 FREE_LOCK(ump);
5976                 return;
5977         }
5978         TAILQ_FOREACH(oldadp, adphead, ad_next) {
5979                 if (oldadp->ad_offset >= off)
5980                         break;
5981         }
5982         if (oldadp == NULL)
5983                 panic("softdep_setup_allocext: lost entry");
5984         /* insert in middle of list */
5985         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5986         if (oldadp->ad_offset == off)
5987                 allocdirect_merge(adphead, adp, oldadp);
5988         FREE_LOCK(ump);
5989 }
5990
5991 /*
5992  * Indirect block allocation dependencies.
5993  *
5994  * The same dependencies that exist for a direct block also exist when
5995  * a new block is allocated and pointed to by an entry in a block of
5996  * indirect pointers. The undo/redo states described above are also
5997  * used here. Because an indirect block contains many pointers that
5998  * may have dependencies, a second copy of the entire in-memory indirect
5999  * block is kept. The buffer cache copy is always completely up-to-date.
6000  * The second copy, which is used only as a source for disk writes,
6001  * contains only the safe pointers (i.e., those that have no remaining
6002  * update dependencies). The second copy is freed when all pointers
6003  * are safe. The cache is not allowed to replace indirect blocks with
6004  * pending update dependencies. If a buffer containing an indirect
6005  * block with dependencies is written, these routines will mark it
6006  * dirty again. It can only be successfully written once all the
 * dependencies are removed. The ffs_fsync routine and
 * softdep_sync_metadata work together to get all the dependencies
 * removed so that a file can be successfully written to disk. Three
6010  * procedures are used when setting up indirect block pointer
6011  * dependencies. The division is necessary because of the organization
6012  * of the "balloc" routine and because of the distinction between file
6013  * pages and file metadata blocks.
6014  */
6015
6016 /*
6017  * Allocate a new allocindir structure.
6018  */
6019 static struct allocindir *
6020 newallocindir(
6021         struct inode *ip,       /* inode for file being extended */
6022         int ptrno,              /* offset of pointer in indirect block */
6023         ufs2_daddr_t newblkno,  /* disk block number being added */
6024         ufs2_daddr_t oldblkno,  /* previous block number, 0 if none */
6025         ufs_lbn_t lbn)
6026 {
6027         struct newblk *newblk;
6028         struct allocindir *aip;
6029         struct freefrag *freefrag;
6030         struct jnewblk *jnewblk;
6031
6032         if (oldblkno)
6033                 freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
6034                     SINGLETON_KEY);
6035         else
6036                 freefrag = NULL;
6037         ACQUIRE_LOCK(ITOUMP(ip));
6038         if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
6039                 panic("new_allocindir: lost block");
6040         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
6041             ("newallocindir: newblk already initialized"));
6042         WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
6043         newblk->nb_freefrag = freefrag;
6044         aip = (struct allocindir *)newblk;
6045         aip->ai_offset = ptrno;
6046         aip->ai_oldblkno = oldblkno;
6047         aip->ai_lbn = lbn;
6048         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
6049                 jnewblk->jn_ino = ip->i_number;
6050                 jnewblk->jn_lbn = lbn;
6051                 add_to_journal(&jnewblk->jn_list);
6052         }
6053         if (freefrag && freefrag->ff_jdep != NULL &&
6054             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
6055                 add_to_journal(freefrag->ff_jdep);
6056         return (aip);
6057 }
6058
6059 /*
6060  * Called just before setting an indirect block pointer
6061  * to a newly allocated file page.
6062  */
6063 void
6064 softdep_setup_allocindir_page(
6065         struct inode *ip,       /* inode for file being extended */
6066         ufs_lbn_t lbn,          /* allocated block number within file */
6067         struct buf *bp,         /* buffer with indirect blk referencing page */
6068         int ptrno,              /* offset of pointer in indirect block */
6069         ufs2_daddr_t newblkno,  /* disk block number being added */
6070         ufs2_daddr_t oldblkno,  /* previous block number, 0 if none */
6071         struct buf *nbp)        /* buffer holding allocated page */
6072 {
6073         struct inodedep *inodedep;
6074         struct freefrag *freefrag;
6075         struct allocindir *aip;
6076         struct pagedep *pagedep;
6077         struct mount *mp;
6078         struct ufsmount *ump;
6079
6080         mp = ITOVFS(ip);
6081         ump = VFSTOUFS(mp);
6082         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6083             ("softdep_setup_allocindir_page called on non-softdep filesystem"));
6084         KASSERT(lbn == nbp->b_lblkno,
6085             ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
6086             lbn, bp->b_lblkno));
6087         CTR4(KTR_SUJ,
6088             "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
6089             "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
6090         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
6091         aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
6092         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6093         /*
6094          * If we are allocating a directory page, then we must
6095          * allocate an associated pagedep to track additions and
6096          * deletions.
6097          */
6098         if ((ip->i_mode & IFMT) == IFDIR)
6099                 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
6100         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
6101         freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
6102         FREE_LOCK(ump);
6103         if (freefrag)
6104                 handle_workitem_freefrag(freefrag);
6105 }
6106
6107 /*
6108  * Called just before setting an indirect block pointer to a
6109  * newly allocated indirect block.
6110  */
6111 void
6112 softdep_setup_allocindir_meta(
6113         struct buf *nbp,        /* newly allocated indirect block */
6114         struct inode *ip,       /* inode for file being extended */
6115         struct buf *bp,         /* indirect block referencing allocated block */
6116         int ptrno,              /* offset of pointer in indirect block */
6117         ufs2_daddr_t newblkno)  /* disk block number being added */
6118 {
6119         struct inodedep *inodedep;
6120         struct allocindir *aip;
6121         struct ufsmount *ump;
6122         ufs_lbn_t lbn;
6123
6124         ump = ITOUMP(ip);
6125         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
6126             ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
6127         CTR3(KTR_SUJ,
6128             "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
6129             ip->i_number, newblkno, ptrno);
6130         lbn = nbp->b_lblkno;
6131         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
6132         aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
6133         inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
6134         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
6135         if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
6136                 panic("softdep_setup_allocindir_meta: Block already existed");
6137         FREE_LOCK(ump);
6138 }
6139
6140 static void
6141 indirdep_complete(struct indirdep *indirdep)
6142 {
6143         struct allocindir *aip;
6144
6145         LIST_REMOVE(indirdep, ir_next);
6146         indirdep->ir_state |= DEPCOMPLETE;
6147
6148         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
6149                 LIST_REMOVE(aip, ai_next);
6150                 free_newblk(&aip->ai_block);
6151         }
6152         /*
6153          * If this indirdep is not attached to a buf it was simply waiting
6154          * on completion to clear completehd.  free_indirdep() asserts
6155          * that nothing is dangling.
6156          */
6157         if ((indirdep->ir_state & ONWORKLIST) == 0)
6158                 free_indirdep(indirdep);
6159 }
6160
/*
 * Find the indirdep attached to the indirect-block buffer bp, creating
 * one if none exists, and return it.
 *
 * The softdep lock must be held on entry (LOCK_OWNED) and is held on
 * return.  It is dropped while a new indirdep is allocated and
 * initialized, so bp->b_dep is searched again after the lock has been
 * re-acquired; finding an indirdep at that point is treated as a fatal
 * "simultaneous create".
 *
 * A new indirdep gets a save buffer (ir_savebp) obtained with getblk()
 * on the device vnode, filled with a copy of bp's current contents.
 */
static struct indirdep *
indirdep_lookup(struct mount *mp,
        struct inode *ip,
        struct buf *bp)
{
        struct indirdep *indirdep, *newindirdep;
        struct newblk *newblk;
        struct ufsmount *ump;
        struct worklist *wk;
        struct fs *fs;
        ufs2_daddr_t blkno;

        ump = VFSTOUFS(mp);
        LOCK_OWNED(ump);
        indirdep = NULL;
        newindirdep = NULL;
        fs = ump->um_fs;
        /*
         * At most two passes: the first searches and, on a miss,
         * allocates with the lock dropped; the second re-searches with
         * the new structure in hand.
         */
        for (;;) {
                LIST_FOREACH(wk, &bp->b_dep, wk_list) {
                        if (wk->wk_type != D_INDIRDEP)
                                continue;
                        indirdep = WK_INDIRDEP(wk);
                        break;
                }
                /* Found on the buffer worklist, no new structure to free. */
                if (indirdep != NULL && newindirdep == NULL)
                        return (indirdep);
                /* One appeared while the lock was dropped for allocation. */
                if (indirdep != NULL && newindirdep != NULL)
                        panic("indirdep_lookup: simultaneous create");
                /* None found on the buffer and a new structure is ready. */
                if (indirdep == NULL && newindirdep != NULL)
                        break;
                /* None found and no new structure available. */
                FREE_LOCK(ump);
                newindirdep = malloc(sizeof(struct indirdep),
                    M_INDIRDEP, M_SOFTDEP_FLAGS);
                workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
                newindirdep->ir_state = ATTACHED;
                if (I_IS_UFS1(ip))
                        newindirdep->ir_state |= UFS1FMT;
                TAILQ_INIT(&newindirdep->ir_trunc);
                newindirdep->ir_saveddata = NULL;
                LIST_INIT(&newindirdep->ir_deplisthd);
                LIST_INIT(&newindirdep->ir_donehd);
                LIST_INIT(&newindirdep->ir_writehd);
                LIST_INIT(&newindirdep->ir_completehd);
                /*
                 * b_blkno still equal to b_lblkno presumably means the
                 * buffer has not been mapped to a disk address yet;
                 * look it up now.  NOTE(review): the return value of
                 * ufs_bmaparray() is not checked -- confirm it cannot
                 * fail here.
                 */
                if (bp->b_blkno == bp->b_lblkno) {
                        ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
                            NULL, NULL);
                        bp->b_blkno = blkno;
                }
                newindirdep->ir_freeblks = NULL;
                /* Same disk address and size as bp, on the device vnode. */
                newindirdep->ir_savebp =
                    getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
                newindirdep->ir_bp = bp;
                BUF_KERNPROC(newindirdep->ir_savebp);
                bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
                ACQUIRE_LOCK(ump);
        }
        /* Lock is held again; publish the new indirdep on the buffer. */
        indirdep = newindirdep;
        WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
        /*
         * If the block is not yet allocated we don't set DEPCOMPLETE so
         * that we don't free dependencies until the pointers are valid.
         * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
         * than using the hash.
         */
        if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
                LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
        else
                indirdep->ir_state |= DEPCOMPLETE;
        return (indirdep);
}
6234
6235 /*
6236  * Called to finish the allocation of the "aip" allocated
6237  * by one of the two routines above.
6238  */
6239 static struct freefrag *
6240 setup_allocindir_phase2(
6241         struct buf *bp,         /* in-memory copy of the indirect block */
6242         struct inode *ip,       /* inode for file being extended */
6243         struct inodedep *inodedep, /* Inodedep for ip */
6244         struct allocindir *aip, /* allocindir allocated by the above routines */
6245         ufs_lbn_t lbn)          /* Logical block number for this block. */
6246 {
6247         struct fs *fs __diagused;
6248         struct indirdep *indirdep;
6249         struct allocindir *oldaip;
6250         struct freefrag *freefrag;
6251         struct mount *mp;
6252         struct ufsmount *ump;
6253
6254         mp = ITOVFS(ip);
6255         ump = VFSTOUFS(mp);
6256         LOCK_OWNED(ump);
6257         fs = ump->um_fs;
6258         if (bp->b_lblkno >= 0)
6259                 panic("setup_allocindir_phase2: not indir blk");
6260         KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6261             ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6262         indirdep = indirdep_lookup(mp, ip, bp);
6263         KASSERT(indirdep->ir_savebp != NULL,
6264             ("setup_allocindir_phase2 NULL ir_savebp"));
6265         aip->ai_indirdep = indirdep;
6266         /*
6267          * Check for an unwritten dependency for this indirect offset.  If
6268          * there is, merge the old dependency into the new one.  This happens
6269          * as a result of reallocblk only.
6270          */
6271         freefrag = NULL;
6272         if (aip->ai_oldblkno != 0) {
6273                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6274                         if (oldaip->ai_offset == aip->ai_offset) {
6275                                 freefrag = allocindir_merge(aip, oldaip);
6276                                 goto done;
6277                         }
6278                 }
6279                 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6280                         if (oldaip->ai_offset == aip->ai_offset) {
6281                                 freefrag = allocindir_merge(aip, oldaip);
6282                                 goto done;
6283                         }
6284                 }
6285         }
6286 done:
6287         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6288         return (freefrag);
6289 }
6290
6291 /*
6292  * Merge two allocindirs which refer to the same block.  Move newblock
6293  * dependencies and setup the freefrags appropriately.
6294  */
6295 static struct freefrag *
6296 allocindir_merge(
6297         struct allocindir *aip,
6298         struct allocindir *oldaip)
6299 {
6300         struct freefrag *freefrag;
6301         struct worklist *wk;
6302
6303         if (oldaip->ai_newblkno != aip->ai_oldblkno)
6304                 panic("allocindir_merge: blkno");
6305         aip->ai_oldblkno = oldaip->ai_oldblkno;
6306         freefrag = aip->ai_freefrag;
6307         aip->ai_freefrag = oldaip->ai_freefrag;
6308         oldaip->ai_freefrag = NULL;
6309         KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6310         /*
6311          * If we are tracking a new directory-block allocation,
6312          * move it from the old allocindir to the new allocindir.
6313          */
6314         if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6315                 WORKLIST_REMOVE(wk);
6316                 if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6317                         panic("allocindir_merge: extra newdirblk");
6318                 WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6319         }
6320         /*
6321          * We can skip journaling for this freefrag and just complete
6322          * any pending journal work for the allocindir that is being
6323          * removed after the freefrag completes.
6324          */
6325         if (freefrag->ff_jdep)
6326                 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6327         LIST_REMOVE(oldaip, ai_next);
6328         freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6329             &freefrag->ff_list, &freefrag->ff_jwork);
6330         free_newblk(&oldaip->ai_block);
6331
6332         return (freefrag);
6333 }
6334
6335 static inline void
6336 setup_freedirect(
6337         struct freeblks *freeblks,
6338         struct inode *ip,
6339         int i,
6340         int needj)
6341 {
6342         struct ufsmount *ump;
6343         ufs2_daddr_t blkno;
6344         int frags;
6345
6346         blkno = DIP(ip, i_db[i]);
6347         if (blkno == 0)
6348                 return;
6349         DIP_SET(ip, i_db[i], 0);
6350         ump = ITOUMP(ip);
6351         frags = sblksize(ump->um_fs, ip->i_size, i);
6352         frags = numfrags(ump->um_fs, frags);
6353         newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj);
6354 }
6355
6356 static inline void
6357 setup_freeext(
6358         struct freeblks *freeblks,
6359         struct inode *ip,
6360         int i,
6361         int needj)
6362 {
6363         struct ufsmount *ump;
6364         ufs2_daddr_t blkno;
6365         int frags;
6366
6367         blkno = ip->i_din2->di_extb[i];
6368         if (blkno == 0)
6369                 return;
6370         ip->i_din2->di_extb[i] = 0;
6371         ump = ITOUMP(ip);
6372         frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
6373         frags = numfrags(ump->um_fs, frags);
6374         newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6375 }
6376
6377 static inline void
6378 setup_freeindir(
6379         struct freeblks *freeblks,
6380         struct inode *ip,
6381         int i,
6382         ufs_lbn_t lbn,
6383         int needj)
6384 {
6385         struct ufsmount *ump;
6386         ufs2_daddr_t blkno;
6387
6388         blkno = DIP(ip, i_ib[i]);
6389         if (blkno == 0)
6390                 return;
6391         DIP_SET(ip, i_ib[i], 0);
6392         ump = ITOUMP(ip);
6393         newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag,
6394             0, needj);
6395 }
6396
6397 static inline struct freeblks *
6398 newfreeblks(struct mount *mp, struct inode *ip)
6399 {
6400         struct freeblks *freeblks;
6401
6402         freeblks = malloc(sizeof(struct freeblks),
6403                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6404         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6405         LIST_INIT(&freeblks->fb_jblkdephd);
6406         LIST_INIT(&freeblks->fb_jwork);
6407         freeblks->fb_ref = 0;
6408         freeblks->fb_cgwait = 0;
6409         freeblks->fb_state = ATTACHED;
6410         freeblks->fb_uid = ip->i_uid;
6411         freeblks->fb_inum = ip->i_number;
6412         freeblks->fb_vtype = ITOV(ip)->v_type;
6413         freeblks->fb_modrev = DIP(ip, i_modrev);
6414         freeblks->fb_devvp = ITODEVVP(ip);
6415         freeblks->fb_chkcnt = 0;
6416         freeblks->fb_len = 0;
6417
6418         return (freeblks);
6419 }
6420
6421 static void
6422 trunc_indirdep(
6423         struct indirdep *indirdep,
6424         struct freeblks *freeblks,
6425         struct buf *bp,
6426         int off)
6427 {
6428         struct allocindir *aip, *aipn;
6429
6430         /*
6431          * The first set of allocindirs won't be in savedbp.
6432          */
6433         LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6434                 if (aip->ai_offset > off)
6435                         cancel_allocindir(aip, bp, freeblks, 1);
6436         LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6437                 if (aip->ai_offset > off)
6438                         cancel_allocindir(aip, bp, freeblks, 1);
6439         /*
6440          * These will exist in savedbp.
6441          */
6442         LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6443                 if (aip->ai_offset > off)
6444                         cancel_allocindir(aip, NULL, freeblks, 0);
6445         LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6446                 if (aip->ai_offset > off)
6447                         cancel_allocindir(aip, NULL, freeblks, 0);
6448 }
6449
/*
 * Follow the chain of indirects down to lastlbn creating a freework
 * structure for each.  This will be used to start indir_trunc() at
 * the right offset and create the journal records for the partial
 * truncation.  A second step will handle the truncated dependencies.
 *
 * lbn is the logical block number of the indirect block being visited
 * and blkno its disk address; lastlbn is the last block to keep.  The
 * function calls itself for the child indirect holding lastlbn until
 * lbn_level() reports level 0.
 *
 * Returns 0 on success (also when blkno is 0 and there is nothing to
 * visit) or the error returned by ffs_breadz().
 */
static int
setup_trunc_indir(
        struct freeblks *freeblks,
        struct inode *ip,
        ufs_lbn_t lbn,
        ufs_lbn_t lastlbn,
        ufs2_daddr_t blkno)
{
        struct indirdep *indirdep;
        struct indirdep *indirn;
        struct freework *freework;
        struct newblk *newblk;
        struct mount *mp;
        struct ufsmount *ump;
        struct buf *bp;
        uint8_t *start;
        uint8_t *end;
        ufs_lbn_t lbnadd;
        int level;
        int error;
        int off;

        /* freework stays NULL when this block needs no partial truncation. */
        freework = NULL;
        if (blkno == 0)
                return (0);
        mp = freeblks->fb_list.wk_mp;
        ump = VFSTOUFS(mp);
        /*
         * Here, calls to VOP_BMAP() will fail.  However, we already have
         * the on-disk address, so we just pass it to bread() instead of
         * having bread() attempt to calculate it using VOP_BMAP().
         */
        error = ffs_breadz(ump, ITOV(ip), lbn, blkptrtodb(ump, blkno),
            (int)mp->mnt_stat.f_iosize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
        if (error)
                return (error);
        level = lbn_level(lbn);
        lbnadd = lbn_offset(ump->um_fs, level);
        /*
         * Compute the offset of the last block we want to keep.  Store
         * in the freework the first block we want to completely free.
         */
        off = (lastlbn - -(lbn + level)) / lbnadd;
        /*
         * The pointer we keep is the last one in this block, so no
         * pointer here is freed; skip the freework and just descend.
         */
        if (off + 1 == NINDIR(ump->um_fs))
                goto nowork;
        freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
        /*
         * Link the freework into the indirdep.  This will prevent any new
         * allocations from proceeding until we are finished with the
         * truncate and the block is written.
         */
        ACQUIRE_LOCK(ump);
        indirdep = indirdep_lookup(mp, ip, bp);
        if (indirdep->ir_freeblks)
                panic("setup_trunc_indir: indirdep already truncated.");
        TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
        freework->fw_indir = indirdep;
        /*
         * Cancel any allocindirs that will not make it to disk.
         * We have to do this for all copies of the indirdep that
         * live on this newblk.
         */
        if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
                if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0,
                    &newblk) == 0)
                        panic("setup_trunc_indir: lost block");
                LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
                        trunc_indirdep(indirn, freeblks, bp, off);
        } else
                trunc_indirdep(indirdep, freeblks, bp, off);
        FREE_LOCK(ump);
        /*
         * Creation is protected by the buf lock. The saveddata is only
         * needed if a full truncation follows a partial truncation but it
         * is difficult to allocate in that case so we fetch it anyway.
         */
        if (indirdep->ir_saveddata == NULL)
                indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
                    M_SOFTDEP_FLAGS);
nowork:
        /* Fetch the blkno of the child and the zero start offset. */
        if (I_IS_UFS1(ip)) {
                blkno = ((ufs1_daddr_t *)bp->b_data)[off];
                start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
        } else {
                blkno = ((ufs2_daddr_t *)bp->b_data)[off];
                start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
        }
        if (freework) {
                /* Zero the truncated pointers. */
                end = bp->b_data + bp->b_bcount;
                bzero(start, end - start);
                /* The buffer was modified; hand it to bdwrite(). */
                bdwrite(bp);
        } else
                /* Nothing was changed in this block; just release it. */
                bqrelse(bp);
        if (level == 0)
                return (0);
        /* Derive the lbn passed down for the child reached through [off]. */
        lbn++; /* adjust level */
        lbn -= (off * lbnadd);
        return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
}
6557
/*
 * Complete the partial truncation of an indirect block setup by
 * setup_trunc_indir().  This zeros the truncated pointers in the saved
 * copy and writes them to disk before the freeblks is allowed to complete.
 *
 * Called with the per-mount softdep lock held (see LOCK_OWNED below).
 * The lock may be dropped and re-taken while waiting for the buffer
 * lock and around brelse().
 */
static void
complete_trunc_indir(struct freework *freework)
{
        struct freework *fwn;
        struct indirdep *indirdep;
        struct ufsmount *ump;
        struct buf *bp;
        uintptr_t start;
        int count;

        ump = VFSTOUFS(freework->fw_list.wk_mp);
        LOCK_OWNED(ump);
        indirdep = freework->fw_indir;
        /*
         * Loop until we either hold the lock on the indirdep's buffer
         * or find that the indirdep no longer has one.
         */
        for (;;) {
                bp = indirdep->ir_bp;
                /* See if the block was discarded. */
                if (bp == NULL)
                        break;
                /* Inline part of getdirtybuf().  We don't want bremfree. */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
                        break;
                /*
                 * The non-blocking attempt failed.  Sleep on the buffer
                 * lock, passing the softdep lock as interlock, then drop
                 * the buffer lock if we got it and retry from the top
                 * with the softdep lock re-acquired.
                 */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                    LOCK_PTR(ump)) == 0)
                        BUF_UNLOCK(bp);
                ACQUIRE_LOCK(ump);
        }
        freework->fw_state |= DEPCOMPLETE;
        TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
        /*
         * Zero the pointers in the saved copy.
         */
        if (indirdep->ir_state & UFS1FMT)
                start = sizeof(ufs1_daddr_t);
        else
                start = sizeof(ufs2_daddr_t);
        /* Byte offset of the first pointer to clear, then bytes remaining. */
        start *= freework->fw_start;
        count = indirdep->ir_savebp->b_bcount - start;
        /* Turn the offset into an address inside the saved buffer. */
        start += (uintptr_t)indirdep->ir_savebp->b_data;
        bzero((char *)start, count);
        /*
         * We need to start the next truncation in the list if it has not
         * been started yet.
         */
        fwn = TAILQ_FIRST(&indirdep->ir_trunc);
        if (fwn != NULL) {
                if (fwn->fw_freeblks == indirdep->ir_freeblks)
                        TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
                if ((fwn->fw_state & ONWORKLIST) == 0)
                        freework_enqueue(fwn);
        }
        /*
         * If bp is NULL the block was fully truncated, restore
         * the saved block list otherwise free it if it is no
         * longer needed.
         */
        if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
                if (bp == NULL)
                        bcopy(indirdep->ir_saveddata,
                            indirdep->ir_savebp->b_data,
                            indirdep->ir_savebp->b_bcount);
                free(indirdep->ir_saveddata, M_INDIRDEP);
                indirdep->ir_saveddata = NULL;
        }
        /*
         * When bp is NULL there is a full truncation pending.  We
         * must wait for this full truncation to be journaled before
         * we can release this freework because the disk pointers will
         * never be written as zero.
         */
        if (bp == NULL)  {
                if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
                        handle_written_freework(freework);
                else
                        WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
                           &freework->fw_list);
                /* No further truncation is queued on this indirdep. */
                if (fwn == NULL) {
                        /*
                         * Poison the back pointer before the indirdep
                         * goes away.
                         * NOTE(review): freework was already passed to
                         * handle_written_freework() or queued above;
                         * confirm it is still valid to write to here.
                         */
                        freework->fw_indir = (void *)0x0000deadbeef0000;
                        bp = indirdep->ir_savebp;
                        indirdep->ir_savebp = NULL;
                        free_indirdep(indirdep);
                        /* brelse() is called without the softdep lock. */
                        FREE_LOCK(ump);
                        brelse(bp);
                        ACQUIRE_LOCK(ump);
                }
        } else {
                /* Complete when the real copy is written. */
                WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
                BUF_UNLOCK(bp);
        }
}
6653
6654 /*
6655  * Calculate the number of blocks we are going to release where datablocks
6656  * is the current total and length is the new file size.
6657  */
6658 static ufs2_daddr_t
6659 blkcount(struct fs *fs,
6660         ufs2_daddr_t datablocks,
6661         off_t length)
6662 {
6663         off_t totblks, numblks;
6664
6665         totblks = 0;
6666         numblks = howmany(length, fs->fs_bsize);
6667         if (numblks <= UFS_NDADDR) {
6668                 totblks = howmany(length, fs->fs_fsize);
6669                 goto out;
6670         }
6671         totblks = blkstofrags(fs, numblks);
6672         numblks -= UFS_NDADDR;
6673         /*
6674          * Count all single, then double, then triple indirects required.
6675          * Subtracting one indirects worth of blocks for each pass
6676          * acknowledges one of each pointed to by the inode.
6677          */
6678         for (;;) {
6679                 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6680                 numblks -= NINDIR(fs);
6681                 if (numblks <= 0)
6682                         break;
6683                 numblks = howmany(numblks, NINDIR(fs));
6684         }
6685 out:
6686         totblks = fsbtodb(fs, totblks);
6687         /*
6688          * Handle sparse files.  We can't reclaim more blocks than the inode
6689          * references.  We will correct it later in handle_complete_freeblks()
6690          * when we know the real count.
6691          */
6692         if (totblks > datablocks)
6693                 return (0);
6694         return (datablocks - totblks);
6695 }
6696
6697 /*
6698  * Handle freeblocks for journaled softupdate filesystems.
6699  *
6700  * Contrary to normal softupdates, we must preserve the block pointers in
6701  * indirects until their subordinates are free.  This is to avoid journaling
6702  * every block that is freed which may consume more space than the journal
6703  * itself.  The recovery program will see the free block journals at the
6704  * base of the truncated area and traverse them to reclaim space.  The
6705  * pointers in the inode may be cleared immediately after the journal
6706  * records are written because each direct and indirect pointer in the
6707  * inode is recorded in a journal.  This permits full truncation to proceed
6708  * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6709  *
6710  * The algorithm is as follows:
6711  * 1) Traverse the in-memory state and create journal entries to release
6712  *    the relevant blocks and full indirect trees.
6713  * 2) Traverse the indirect block chain adding partial truncation freework
6714  *    records to indirects in the path to lastlbn.  The freework will
6715  *    prevent new allocation dependencies from being satisfied in this
6716  *    indirect until the truncation completes.
6717  * 3) Read and lock the inode block, performing an update with the new size
6718  *    and pointers.  This prevents truncated data from becoming valid on
6719  *    disk through step 4.
6720  * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6721  *    eliminate journal work for those records that do not require it.
6722  * 5) Schedule the journal records to be written followed by the inode block.
6723  * 6) Allocate any necessary frags for the end of file.
6724  * 7) Zero any partially truncated blocks.
6725  *
6726  * From this truncation proceeds asynchronously using the freework and
6727  * indir_trunc machinery.  The file will not be extended again into a
6728  * partially truncated indirect block until all work is completed but
6729  * the normal dependency mechanism ensures that it is rolled back/forward
6730  * as appropriate.  Further truncation may occur without delay and is
6731  * serialized in indir_trunc().
6732  */
6733 void
6734 softdep_journal_freeblocks(
6735         struct inode *ip,       /* The inode whose length is to be reduced */
6736         struct ucred *cred,
6737         off_t length,           /* The new length for the file */
6738         int flags)              /* IO_EXT and/or IO_NORMAL */
6739 {
6740         struct freeblks *freeblks, *fbn;
6741         struct worklist *wk, *wkn;
6742         struct inodedep *inodedep;
6743         struct jblkdep *jblkdep;
6744         struct allocdirect *adp, *adpn;
6745         struct ufsmount *ump;
6746         struct fs *fs;
6747         struct buf *bp;
6748         struct vnode *vp;
6749         struct mount *mp;
6750         daddr_t dbn;
6751         ufs2_daddr_t extblocks, datablocks;
6752         ufs_lbn_t tmpval, lbn, lastlbn;
6753         int frags, lastoff, iboff, allocblock, needj, error, i;
6754
6755         ump = ITOUMP(ip);
6756         mp = UFSTOVFS(ump);
6757         fs = ump->um_fs;
6758         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6759             ("softdep_journal_freeblocks called on non-softdep filesystem"));
6760         vp = ITOV(ip);
6761         needj = 1;
6762         iboff = -1;
6763         allocblock = 0;
6764         extblocks = 0;
6765         datablocks = 0;
6766         frags = 0;
6767         freeblks = newfreeblks(mp, ip);
6768         ACQUIRE_LOCK(ump);
6769         /*
6770          * If we're truncating a removed file that will never be written
6771          * we don't need to journal the block frees.  The canceled journals
6772          * for the allocations will suffice.
6773          */
6774         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6775         if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6776             length == 0)
6777                 needj = 0;
6778         CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6779             ip->i_number, length, needj);
6780         FREE_LOCK(ump);
6781         /*
6782          * Calculate the lbn that we are truncating to.  This results in -1
6783          * if we're truncating the 0 bytes.  So it is the last lbn we want
6784          * to keep, not the first lbn we want to truncate.
6785          */
6786         lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6787         lastoff = blkoff(fs, length);
6788         /*
6789          * Compute frags we are keeping in lastlbn.  0 means all.
6790          */
6791         if (lastlbn >= 0 && lastlbn < UFS_NDADDR) {
6792                 frags = fragroundup(fs, lastoff);
6793                 /* adp offset of last valid allocdirect. */
6794                 iboff = lastlbn;
6795         } else if (lastlbn > 0)
6796                 iboff = UFS_NDADDR;
6797         if (fs->fs_magic == FS_UFS2_MAGIC)
6798                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6799         /*
6800          * Handle normal data blocks and indirects.  This section saves
6801          * values used after the inode update to complete frag and indirect
6802          * truncation.
6803          */
6804         if ((flags & IO_NORMAL) != 0) {
6805                 /*
6806                  * Handle truncation of whole direct and indirect blocks.
6807                  */
6808                 for (i = iboff + 1; i < UFS_NDADDR; i++)
6809                         setup_freedirect(freeblks, ip, i, needj);
6810                 for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
6811                     i < UFS_NIADDR;
6812                     i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6813                         /* Release a whole indirect tree. */
6814                         if (lbn > lastlbn) {
6815                                 setup_freeindir(freeblks, ip, i, -lbn -i,
6816                                     needj);
6817                                 continue;
6818                         }
6819                         iboff = i + UFS_NDADDR;
6820                         /*
6821                          * Traverse partially truncated indirect tree.
6822                          */
6823                         if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6824                                 setup_trunc_indir(freeblks, ip, -lbn - i,
6825                                     lastlbn, DIP(ip, i_ib[i]));
6826                 }
6827                 /*
6828                  * Handle partial truncation to a frag boundary.
6829                  */
6830                 if (frags) {
6831                         ufs2_daddr_t blkno;
6832                         long oldfrags;
6833
6834                         oldfrags = blksize(fs, ip, lastlbn);
6835                         blkno = DIP(ip, i_db[lastlbn]);
6836                         if (blkno && oldfrags != frags) {
6837                                 oldfrags -= frags;
6838                                 oldfrags = numfrags(fs, oldfrags);
6839                                 blkno += numfrags(fs, frags);
6840                                 newfreework(ump, freeblks, NULL, lastlbn,
6841                                     blkno, oldfrags, 0, needj);
6842                                 if (needj)
6843                                         adjust_newfreework(freeblks,
6844                                             numfrags(fs, frags));
6845                         } else if (blkno == 0)
6846                                 allocblock = 1;
6847                 }
6848                 /*
6849                  * Add a journal record for partial truncate if we are
6850                  * handling indirect blocks.  Non-indirects need no extra
6851                  * journaling.
6852                  */
6853                 if (length != 0 && lastlbn >= UFS_NDADDR) {
6854                         UFS_INODE_SET_FLAG(ip, IN_TRUNCATED);
6855                         newjtrunc(freeblks, length, 0);
6856                 }
6857                 ip->i_size = length;
6858                 DIP_SET(ip, i_size, ip->i_size);
6859                 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
6860                 datablocks = DIP(ip, i_blocks) - extblocks;
6861                 if (length != 0)
6862                         datablocks = blkcount(fs, datablocks, length);
6863                 freeblks->fb_len = length;
6864         }
6865         if ((flags & IO_EXT) != 0) {
6866                 for (i = 0; i < UFS_NXADDR; i++)
6867                         setup_freeext(freeblks, ip, i, needj);
6868                 ip->i_din2->di_extsize = 0;
6869                 datablocks += extblocks;
6870                 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
6871         }
6872 #ifdef QUOTA
6873         /* Reference the quotas in case the block count is wrong in the end. */
6874         quotaref(vp, freeblks->fb_quota);
6875         (void) chkdq(ip, -datablocks, NOCRED, FORCE);
6876 #endif
6877         freeblks->fb_chkcnt = -datablocks;
6878         UFS_LOCK(ump);
6879         fs->fs_pendingblocks += datablocks;
6880         UFS_UNLOCK(ump);
6881         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6882         /*
6883          * Handle truncation of incomplete alloc direct dependencies.  We
6884          * hold the inode block locked to prevent incomplete dependencies
6885          * from reaching the disk while we are eliminating those that
6886          * have been truncated.  This is a partially inlined ffs_update().
6887          */
6888         ufs_itimes(vp);
6889         ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6890         dbn = fsbtodb(fs, ino_to_fsba(fs, ip->i_number));
6891         error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize,
6892             NULL, NULL, 0, cred, 0, NULL, &bp);
6893         if (error) {
6894                 softdep_error("softdep_journal_freeblocks", error);
6895                 return;
6896         }
6897         if (bp->b_bufsize == fs->fs_bsize)
6898                 bp->b_flags |= B_CLUSTEROK;
6899         softdep_update_inodeblock(ip, bp, 0);
6900         if (ump->um_fstype == UFS1) {
6901                 *((struct ufs1_dinode *)bp->b_data +
6902                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6903         } else {
6904                 ffs_update_dinode_ckhash(fs, ip->i_din2);
6905                 *((struct ufs2_dinode *)bp->b_data +
6906                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6907         }
6908         ACQUIRE_LOCK(ump);
6909         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6910         if ((inodedep->id_state & IOSTARTED) != 0)
6911                 panic("softdep_setup_freeblocks: inode busy");
6912         /*
6913          * Add the freeblks structure to the list of operations that
6914          * must await the zero'ed inode being written to disk. If we
6915          * still have a bitmap dependency (needj), then the inode
6916          * has never been written to disk, so we can process the
6917          * freeblks below once we have deleted the dependencies.
6918          */
6919         if (needj)
6920                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6921         else
6922                 freeblks->fb_state |= COMPLETE;
6923         if ((flags & IO_NORMAL) != 0) {
6924                 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6925                         if (adp->ad_offset > iboff)
6926                                 cancel_allocdirect(&inodedep->id_inoupdt, adp,
6927                                     freeblks);
6928                         /*
6929                          * Truncate the allocdirect.  We could eliminate
6930                          * or modify journal records as well.
6931                          */
6932                         else if (adp->ad_offset == iboff && frags)
6933                                 adp->ad_newsize = frags;
6934                 }
6935         }
6936         if ((flags & IO_EXT) != 0)
6937                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6938                         cancel_allocdirect(&inodedep->id_extupdt, adp,
6939                             freeblks);
6940         /*
6941          * Scan the bufwait list for newblock dependencies that will never
6942          * make it to disk.
6943          */
6944         LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6945                 if (wk->wk_type != D_ALLOCDIRECT)
6946                         continue;
6947                 adp = WK_ALLOCDIRECT(wk);
6948                 if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6949                     ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6950                         cancel_jfreeblk(freeblks, adp->ad_newblkno);
6951                         cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6952                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6953                 }
6954         }
6955         /*
6956          * Add journal work.
6957          */
6958         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6959                 add_to_journal(&jblkdep->jb_list);
6960         FREE_LOCK(ump);
6961         bdwrite(bp);
6962         /*
6963          * Truncate dependency structures beyond length.
6964          */
6965         trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6966         /*
6967          * This is only set when we need to allocate a fragment because
6968          * none existed at the end of a frag-sized file.  It handles only
6969          * allocating a new, zero filled block.
6970          */
6971         if (allocblock) {
6972                 ip->i_size = length - lastoff;
6973                 DIP_SET(ip, i_size, ip->i_size);
6974                 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6975                 if (error != 0) {
6976                         softdep_error("softdep_journal_freeblks", error);
6977                         return;
6978                 }
6979                 ip->i_size = length;
6980                 DIP_SET(ip, i_size, length);
6981                 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE);
6982                 allocbuf(bp, frags);
6983                 ffs_update(vp, 0);
6984                 bawrite(bp);
6985         } else if (lastoff != 0 && vp->v_type != VDIR) {
6986                 int size;
6987
6988                 /*
6989                  * Zero the end of a truncated frag or block.
6990                  */
6991                 size = sblksize(fs, length, lastlbn);
6992                 error = bread(vp, lastlbn, size, cred, &bp);
6993                 if (error == 0) {
6994                         bzero((char *)bp->b_data + lastoff, size - lastoff);
6995                         bawrite(bp);
6996                 } else if (!ffs_fsfail_cleanup(ump, error)) {
6997                         softdep_error("softdep_journal_freeblks", error);
6998                         return;
6999                 }
7000         }
7001         ACQUIRE_LOCK(ump);
7002         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
7003         TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
7004         freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
7005         /*
7006          * We zero earlier truncations so they don't erroneously
7007          * update i_blocks.
7008          */
7009         if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
7010                 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
7011                         fbn->fb_len = 0;
7012         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
7013             LIST_EMPTY(&freeblks->fb_jblkdephd))
7014                 freeblks->fb_state |= INPROGRESS;
7015         else
7016                 freeblks = NULL;
7017         FREE_LOCK(ump);
7018         if (freeblks)
7019                 handle_workitem_freeblocks(freeblks, 0);
7020         trunc_pages(ip, length, extblocks, flags);
7021
7022 }
7023
7024 /*
7025  * Flush a JOP_SYNC to the journal.
7026  */
7027 void
7028 softdep_journal_fsync(struct inode *ip)
7029 {
7030         struct jfsync *jfsync;
7031         struct ufsmount *ump;
7032
7033         ump = ITOUMP(ip);
7034         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7035             ("softdep_journal_fsync called on non-softdep filesystem"));
7036         if ((ip->i_flag & IN_TRUNCATED) == 0)
7037                 return;
7038         ip->i_flag &= ~IN_TRUNCATED;
7039         jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
7040         workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
7041         jfsync->jfs_size = ip->i_size;
7042         jfsync->jfs_ino = ip->i_number;
7043         ACQUIRE_LOCK(ump);
7044         add_to_journal(&jfsync->jfs_list);
7045         jwait(&jfsync->jfs_list, MNT_WAIT);
7046         FREE_LOCK(ump);
7047 }
7048
7049 /*
7050  * Block de-allocation dependencies.
7051  *
7052  * When blocks are de-allocated, the on-disk pointers must be nullified before
7053  * the blocks are made available for use by other files.  (The true
7054  * requirement is that old pointers must be nullified before new on-disk
7055  * pointers are set.  We chose this slightly more stringent requirement to
7056  * reduce complexity.) Our implementation handles this dependency by updating
7057  * the inode (or indirect block) appropriately but delaying the actual block
7058  * de-allocation (i.e., freemap and free space count manipulation) until
7059  * after the updated versions reach stable storage.  After the disk is
7060  * updated, the blocks can be safely de-allocated whenever it is convenient.
7061  * This implementation handles only the common case of reducing a file's
7062  * length to zero. Other cases are handled by the conventional synchronous
7063  * write approach.
7064  *
7065  * The ffs implementation with which we worked double-checks
7066  * the state of the block pointers and file size as it reduces
7067  * a file's length.  Some of this code is replicated here in our
7068  * soft updates implementation.  The freeblks->fb_chkcnt field is
7069  * used to transfer a part of this information to the procedure
7070  * that eventually de-allocates the blocks.
7071  *
7072  * This routine should be called from the routine that shortens
7073  * a file's length, before the inode's size or block pointers
7074  * are modified. It will save the block pointer information for
7075  * later release and zero the inode so that the calling routine
7076  * can release it.
7077  */
7078 void
7079 softdep_setup_freeblocks(
7080         struct inode *ip,       /* The inode whose length is to be reduced */
7081         off_t length,           /* The new length for the file */
7082         int flags)              /* IO_EXT and/or IO_NORMAL */
7083 {
7084         struct ufs1_dinode *dp1;
7085         struct ufs2_dinode *dp2;
7086         struct freeblks *freeblks;
7087         struct inodedep *inodedep;
7088         struct allocdirect *adp;
7089         struct ufsmount *ump;
7090         struct buf *bp;
7091         struct fs *fs;
7092         ufs2_daddr_t extblocks, datablocks;
7093         struct mount *mp;
7094         int i, delay, error;
7095         ufs_lbn_t tmpval;
7096         ufs_lbn_t lbn;
7097
7098         ump = ITOUMP(ip);
7099         mp = UFSTOVFS(ump);
7100         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
7101             ("softdep_setup_freeblocks called on non-softdep filesystem"));
7102         CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
7103             ip->i_number, length);
7104         KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
7105         fs = ump->um_fs;
7106         if ((error = bread(ump->um_devvp,
7107             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
7108             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
7109                 if (!ffs_fsfail_cleanup(ump, error))
7110                         softdep_error("softdep_setup_freeblocks", error);
7111                 return;
7112         }
7113         freeblks = newfreeblks(mp, ip);
7114         extblocks = 0;
7115         datablocks = 0;
7116         if (fs->fs_magic == FS_UFS2_MAGIC)
7117                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
7118         if ((flags & IO_NORMAL) != 0) {
7119                 for (i = 0; i < UFS_NDADDR; i++)
7120                         setup_freedirect(freeblks, ip, i, 0);
7121                 for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
7122                     i < UFS_NIADDR;
7123                     i++, lbn += tmpval, tmpval *= NINDIR(fs))
7124                         setup_freeindir(freeblks, ip, i, -lbn -i, 0);
7125                 ip->i_size = 0;
7126                 DIP_SET(ip, i_size, 0);
7127                 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
7128                 datablocks = DIP(ip, i_blocks) - extblocks;
7129         }
7130         if ((flags & IO_EXT) != 0) {
7131                 for (i = 0; i < UFS_NXADDR; i++)
7132                         setup_freeext(freeblks, ip, i, 0);
7133                 ip->i_din2->di_extsize = 0;
7134                 datablocks += extblocks;
7135                 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
7136         }
7137 #ifdef QUOTA
7138         /* Reference the quotas in case the block count is wrong in the end. */
7139         quotaref(ITOV(ip), freeblks->fb_quota);
7140         (void) chkdq(ip, -datablocks, NOCRED, FORCE);
7141 #endif
7142         freeblks->fb_chkcnt = -datablocks;
7143         UFS_LOCK(ump);
7144         fs->fs_pendingblocks += datablocks;
7145         UFS_UNLOCK(ump);
7146         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
7147         /*
7148          * Push the zero'ed inode to its disk buffer so that we are free
7149          * to delete its dependencies below. Once the dependencies are gone
7150          * the buffer can be safely released.
7151          */
7152         if (ump->um_fstype == UFS1) {
7153                 dp1 = ((struct ufs1_dinode *)bp->b_data +
7154                     ino_to_fsbo(fs, ip->i_number));
7155                 ip->i_din1->di_freelink = dp1->di_freelink;
7156                 *dp1 = *ip->i_din1;
7157         } else {
7158                 dp2 = ((struct ufs2_dinode *)bp->b_data +
7159                     ino_to_fsbo(fs, ip->i_number));
7160                 ip->i_din2->di_freelink = dp2->di_freelink;
7161                 ffs_update_dinode_ckhash(fs, ip->i_din2);
7162                 *dp2 = *ip->i_din2;
7163         }
7164         /*
7165          * Find and eliminate any inode dependencies.
7166          */
7167         ACQUIRE_LOCK(ump);
7168         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
7169         if ((inodedep->id_state & IOSTARTED) != 0)
7170                 panic("softdep_setup_freeblocks: inode busy");
7171         /*
7172          * Add the freeblks structure to the list of operations that
7173          * must await the zero'ed inode being written to disk. If we
7174          * still have a bitmap dependency (delay == 0), then the inode
7175          * has never been written to disk, so we can process the
7176          * freeblks below once we have deleted the dependencies.
7177          */
7178         delay = (inodedep->id_state & DEPCOMPLETE);
7179         if (delay)
7180                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
7181         else
7182                 freeblks->fb_state |= COMPLETE;
7183         /*
7184          * Because the file length has been truncated to zero, any
7185          * pending block allocation dependency structures associated
7186          * with this inode are obsolete and can simply be de-allocated.
7187          * We must first merge the two dependency lists to get rid of
7188          * any duplicate freefrag structures, then purge the merged list.
7189          * If we still have a bitmap dependency, then the inode has never
7190          * been written to disk, so we can free any fragments without delay.
7191          */
7192         if (flags & IO_NORMAL) {
7193                 merge_inode_lists(&inodedep->id_newinoupdt,
7194                     &inodedep->id_inoupdt);
7195                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
7196                         cancel_allocdirect(&inodedep->id_inoupdt, adp,
7197                             freeblks);
7198         }
7199         if (flags & IO_EXT) {
7200                 merge_inode_lists(&inodedep->id_newextupdt,
7201                     &inodedep->id_extupdt);
7202                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
7203                         cancel_allocdirect(&inodedep->id_extupdt, adp,
7204                             freeblks);
7205         }
7206         FREE_LOCK(ump);
7207         bdwrite(bp);
7208         trunc_dependencies(ip, freeblks, -1, 0, flags);
7209         ACQUIRE_LOCK(ump);
7210         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
7211                 (void) free_inodedep(inodedep);
7212         freeblks->fb_state |= DEPCOMPLETE;
7213         /*
7214          * If the inode with zeroed block pointers is now on disk
7215          * we can start freeing blocks.
7216          */
7217         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
7218                 freeblks->fb_state |= INPROGRESS;
7219         else
7220                 freeblks = NULL;
7221         FREE_LOCK(ump);
7222         if (freeblks)
7223                 handle_workitem_freeblocks(freeblks, 0);
7224         trunc_pages(ip, length, extblocks, flags);
7225 }
7226
7227 /*
7228  * Eliminate pages from the page cache that back parts of this inode and
7229  * adjust the vnode pager's idea of our size.  This prevents stale data
7230  * from hanging around in the page cache.
7231  */
7232 static void
7233 trunc_pages(
7234         struct inode *ip,
7235         off_t length,
7236         ufs2_daddr_t extblocks,
7237         int flags)
7238 {
7239         struct vnode *vp;
7240         struct fs *fs;
7241         ufs_lbn_t lbn;
7242         off_t end, extend;
7243
7244         vp = ITOV(ip);
7245         fs = ITOFS(ip);
7246         extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
7247         if ((flags & IO_EXT) != 0)
7248                 vn_pages_remove(vp, extend, 0);
7249         if ((flags & IO_NORMAL) == 0)
7250                 return;
7251         BO_LOCK(&vp->v_bufobj);
7252         drain_output(vp);
7253         BO_UNLOCK(&vp->v_bufobj);
7254         /*
7255          * The vnode pager eliminates file pages we eliminate indirects
7256          * below.
7257          */
7258         vnode_pager_setsize(vp, length);
7259         /*
7260          * Calculate the end based on the last indirect we want to keep.  If
7261          * the block extends into indirects we can just use the negative of
7262          * its lbn.  Doubles and triples exist at lower numbers so we must
7263          * be careful not to remove those, if they exist.  double and triple
7264          * indirect lbns do not overlap with others so it is not important
7265          * to verify how many levels are required.
7266          */
7267         lbn = lblkno(fs, length);
7268         if (lbn >= UFS_NDADDR) {
7269                 /* Calculate the virtual lbn of the triple indirect. */
7270                 lbn = -lbn - (UFS_NIADDR - 1);
7271                 end = OFF_TO_IDX(lblktosize(fs, lbn));
7272         } else
7273                 end = extend;
7274         vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7275 }
7276
7277 /*
7278  * See if the buf bp is in the range eliminated by truncation.
7279  */
7280 static int
7281 trunc_check_buf(
7282         struct buf *bp,
7283         int *blkoffp,
7284         ufs_lbn_t lastlbn,
7285         int lastoff,
7286         int flags)
7287 {
7288         ufs_lbn_t lbn;
7289
7290         *blkoffp = 0;
7291         /* Only match ext/normal blocks as appropriate. */
7292         if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7293             ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7294                 return (0);
7295         /* ALTDATA is always a full truncation. */
7296         if ((bp->b_xflags & BX_ALTDATA) != 0)
7297                 return (1);
7298         /* -1 is full truncation. */
7299         if (lastlbn == -1)
7300                 return (1);
7301         /*
7302          * If this is a partial truncate we only want those
7303          * blocks and indirect blocks that cover the range
7304          * we're after.
7305          */
7306         lbn = bp->b_lblkno;
7307         if (lbn < 0)
7308                 lbn = -(lbn + lbn_level(lbn));
7309         if (lbn < lastlbn)
7310                 return (0);
7311         /* Here we only truncate lblkno if it's partial. */
7312         if (lbn == lastlbn) {
7313                 if (lastoff == 0)
7314                         return (0);
7315                 *blkoffp = lastoff;
7316         }
7317         return (1);
7318 }
7319
7320 /*
7321  * Eliminate any dependencies that exist in memory beyond lblkno:off
7322  */
7323 static void
7324 trunc_dependencies(
7325         struct inode *ip,
7326         struct freeblks *freeblks,
7327         ufs_lbn_t lastlbn,
7328         int lastoff,
7329         int flags)
7330 {
7331         struct bufobj *bo;
7332         struct vnode *vp;
7333         struct buf *bp;
7334         int blkoff;
7335
7336         /*
7337          * We must wait for any I/O in progress to finish so that
7338          * all potential buffers on the dirty list will be visible.
7339          * Once they are all there, walk the list and get rid of
7340          * any dependencies.
7341          */
7342         vp = ITOV(ip);
7343         bo = &vp->v_bufobj;
7344         BO_LOCK(bo);
7345         drain_output(vp);
7346         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
7347                 bp->b_vflags &= ~BV_SCANNED;
7348 restart:
7349         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
7350                 if (bp->b_vflags & BV_SCANNED)
7351                         continue;
7352                 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7353                         bp->b_vflags |= BV_SCANNED;
7354                         continue;
7355                 }
7356                 KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
7357                 if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
7358                         goto restart;
7359                 BO_UNLOCK(bo);
7360                 if (deallocate_dependencies(bp, freeblks, blkoff))
7361                         bqrelse(bp);
7362                 else
7363                         brelse(bp);
7364                 BO_LOCK(bo);
7365                 goto restart;
7366         }
7367         /*
7368          * Now do the work of vtruncbuf while also matching indirect blocks.
7369          */
7370         TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
7371                 bp->b_vflags &= ~BV_SCANNED;
7372 cleanrestart:
7373         TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
7374                 if (bp->b_vflags & BV_SCANNED)
7375                         continue;
7376                 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7377                         bp->b_vflags |= BV_SCANNED;
7378                         continue;
7379                 }
7380                 if (BUF_LOCK(bp,
7381                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
7382                     BO_LOCKPTR(bo)) == ENOLCK) {
7383                         BO_LOCK(bo);
7384                         goto cleanrestart;
7385                 }
7386                 BO_LOCK(bo);
7387                 bp->b_vflags |= BV_SCANNED;
7388                 BO_UNLOCK(bo);
7389                 bremfree(bp);
7390                 if (blkoff != 0) {
7391                         allocbuf(bp, blkoff);
7392                         bqrelse(bp);
7393                 } else {
7394                         bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
7395                         brelse(bp);
7396                 }
7397                 BO_LOCK(bo);
7398                 goto cleanrestart;
7399         }
7400         drain_output(vp);
7401         BO_UNLOCK(bo);
7402 }
7403
7404 static int
7405 cancel_pagedep(
7406         struct pagedep *pagedep,
7407         struct freeblks *freeblks,
7408         int blkoff)
7409 {
7410         struct jremref *jremref;
7411         struct jmvref *jmvref;
7412         struct dirrem *dirrem, *tmp;
7413         int i;
7414
7415         /*
7416          * Copy any directory remove dependencies to the list
7417          * to be processed after the freeblks proceeds.  If
7418          * directory entry never made it to disk they
7419          * can be dumped directly onto the work list.
7420          */
7421         LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7422                 /* Skip this directory removal if it is intended to remain. */
7423                 if (dirrem->dm_offset < blkoff)
7424                         continue;
7425                 /*
7426                  * If there are any dirrems we wait for the journal write
7427                  * to complete and then restart the buf scan as the lock
7428                  * has been dropped.
7429                  */
7430                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7431                         jwait(&jremref->jr_list, MNT_WAIT);
7432                         return (ERESTART);
7433                 }
7434                 LIST_REMOVE(dirrem, dm_next);
7435                 dirrem->dm_dirinum = pagedep->pd_ino;
7436                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7437         }
7438         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7439                 jwait(&jmvref->jm_list, MNT_WAIT);
7440                 return (ERESTART);
7441         }
7442         /*
7443          * When we're partially truncating a pagedep we just want to flush
7444          * journal entries and return.  There can not be any adds in the
7445          * truncated portion of the directory and newblk must remain if
7446          * part of the block remains.
7447          */
7448         if (blkoff != 0) {
7449                 struct diradd *dap;
7450
7451                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7452                         if (dap->da_offset > blkoff)
7453                                 panic("cancel_pagedep: diradd %p off %d > %d",
7454                                     dap, dap->da_offset, blkoff);
7455                 for (i = 0; i < DAHASHSZ; i++)
7456                         LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7457                                 if (dap->da_offset > blkoff)
7458                                         panic("cancel_pagedep: diradd %p off %d > %d",
7459                                             dap, dap->da_offset, blkoff);
7460                 return (0);
7461         }
7462         /*
7463          * There should be no directory add dependencies present
7464          * as the directory could not be truncated until all
7465          * children were removed.
7466          */
7467         KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7468             ("deallocate_dependencies: pendinghd != NULL"));
7469         for (i = 0; i < DAHASHSZ; i++)
7470                 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7471                     ("deallocate_dependencies: diraddhd != NULL"));
7472         if ((pagedep->pd_state & NEWBLOCK) != 0)
7473                 free_newdirblk(pagedep->pd_newdirblk);
7474         if (free_pagedep(pagedep) == 0)
7475                 panic("Failed to free pagedep %p", pagedep);
7476         return (0);
7477 }
7478
7479 /*
7480  * Reclaim any dependency structures from a buffer that is about to
7481  * be reallocated to a new vnode. The buffer must be locked, thus,
7482  * no I/O completion operations can occur while we are manipulating
7483  * its associated dependencies. The mutex is held so that other I/O's
7484  * associated with related dependencies do not occur.
7485  */
7486 static int
7487 deallocate_dependencies(
7488         struct buf *bp,
7489         struct freeblks *freeblks,
7490         int off)
7491 {
7492         struct indirdep *indirdep;
7493         struct pagedep *pagedep;
7494         struct worklist *wk, *wkn;
7495         struct ufsmount *ump;
7496
7497         ump = softdep_bp_to_mp(bp);
7498         if (ump == NULL)
7499                 goto done;
7500         ACQUIRE_LOCK(ump);
7501         LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7502                 switch (wk->wk_type) {
7503                 case D_INDIRDEP:
7504                         indirdep = WK_INDIRDEP(wk);
7505                         if (bp->b_lblkno >= 0 ||
7506                             bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7507                                 panic("deallocate_dependencies: not indir");
7508                         cancel_indirdep(indirdep, bp, freeblks);
7509                         continue;
7510
7511                 case D_PAGEDEP:
7512                         pagedep = WK_PAGEDEP(wk);
7513                         if (cancel_pagedep(pagedep, freeblks, off)) {
7514                                 FREE_LOCK(ump);
7515                                 return (ERESTART);
7516                         }
7517                         continue;
7518
7519                 case D_ALLOCINDIR:
7520                         /*
7521                          * Simply remove the allocindir, we'll find it via
7522                          * the indirdep where we can clear pointers if
7523                          * needed.
7524                          */
7525                         WORKLIST_REMOVE(wk);
7526                         continue;
7527
7528                 case D_FREEWORK:
7529                         /*
7530                          * A truncation is waiting for the zero'd pointers
7531                          * to be written.  It can be freed when the freeblks
7532                          * is journaled.
7533                          */
7534                         WORKLIST_REMOVE(wk);
7535                         wk->wk_state |= ONDEPLIST;
7536                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7537                         break;
7538
7539                 case D_ALLOCDIRECT:
7540                         if (off != 0)
7541                                 continue;
7542                         /* FALLTHROUGH */
7543                 default:
7544                         panic("deallocate_dependencies: Unexpected type %s",
7545                             TYPENAME(wk->wk_type));
7546                         /* NOTREACHED */
7547                 }
7548         }
7549         FREE_LOCK(ump);
7550 done:
7551         /*
7552          * Don't throw away this buf, we were partially truncating and
7553          * some deps may always remain.
7554          */
7555         if (off) {
7556                 allocbuf(bp, off);
7557                 bp->b_vflags |= BV_SCANNED;
7558                 return (EBUSY);
7559         }
7560         bp->b_flags |= B_INVAL | B_NOCACHE;
7561
7562         return (0);
7563 }
7564
7565 /*
7566  * An allocdirect is being canceled due to a truncate.  We must make sure
7567  * the journal entry is released in concert with the blkfree that releases
7568  * the storage.  Completed journal entries must not be released until the
7569  * space is no longer pointed to by the inode or in the bitmap.
7570  */
7571 static void
7572 cancel_allocdirect(
7573         struct allocdirectlst *adphead,
7574         struct allocdirect *adp,
7575         struct freeblks *freeblks)
7576 {
7577         struct freework *freework;
7578         struct newblk *newblk;
7579         struct worklist *wk;
7580
7581         TAILQ_REMOVE(adphead, adp, ad_next);
7582         newblk = (struct newblk *)adp;
7583         freework = NULL;
7584         /*
7585          * Find the correct freework structure.
7586          */
7587         LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7588                 if (wk->wk_type != D_FREEWORK)
7589                         continue;
7590                 freework = WK_FREEWORK(wk);
7591                 if (freework->fw_blkno == newblk->nb_newblkno)
7592                         break;
7593         }
7594         if (freework == NULL)
7595                 panic("cancel_allocdirect: Freework not found");
7596         /*
7597          * If a newblk exists at all we still have the journal entry that
7598          * initiated the allocation so we do not need to journal the free.
7599          */
7600         cancel_jfreeblk(freeblks, freework->fw_blkno);
7601         /*
7602          * If the journal hasn't been written the jnewblk must be passed
7603          * to the call to ffs_blkfree that reclaims the space.  We accomplish
7604          * this by linking the journal dependency into the freework to be
7605          * freed when freework_freeblock() is called.  If the journal has
7606          * been written we can simply reclaim the journal space when the
7607          * freeblks work is complete.
7608          */
7609         freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7610             &freeblks->fb_jwork);
7611         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7612 }
7613
7614 /*
7615  * Cancel a new block allocation.  May be an indirect or direct block.  We
7616  * remove it from various lists and return any journal record that needs to
7617  * be resolved by the caller.
7618  *
7619  * A special consideration is made for indirects which were never pointed
7620  * at on disk and will never be found once this block is released.
7621  */
7622 static struct jnewblk *
7623 cancel_newblk(
7624         struct newblk *newblk,
7625         struct worklist *wk,
7626         struct workhead *wkhd)
7627 {
7628         struct jnewblk *jnewblk;
7629
7630         CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7631
7632         newblk->nb_state |= GOINGAWAY;
7633         /*
7634          * Previously we traversed the completedhd on each indirdep
7635          * attached to this newblk to cancel them and gather journal
7636          * work.  Since we need only the oldest journal segment and
7637          * the lowest point on the tree will always have the oldest
7638          * journal segment we are free to release the segments
7639          * of any subordinates and may leave the indirdep list to
7640          * indirdep_complete() when this newblk is freed.
7641          */
7642         if (newblk->nb_state & ONDEPLIST) {
7643                 newblk->nb_state &= ~ONDEPLIST;
7644                 LIST_REMOVE(newblk, nb_deps);
7645         }
7646         if (newblk->nb_state & ONWORKLIST)
7647                 WORKLIST_REMOVE(&newblk->nb_list);
7648         /*
7649          * If the journal entry hasn't been written we save a pointer to
7650          * the dependency that frees it until it is written or the
7651          * superseding operation completes.
7652          */
7653         jnewblk = newblk->nb_jnewblk;
7654         if (jnewblk != NULL && wk != NULL) {
7655                 newblk->nb_jnewblk = NULL;
7656                 jnewblk->jn_dep = wk;
7657         }
7658         if (!LIST_EMPTY(&newblk->nb_jwork))
7659                 jwork_move(wkhd, &newblk->nb_jwork);
7660         /*
7661          * When truncating we must free the newdirblk early to remove
7662          * the pagedep from the hash before returning.
7663          */
7664         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7665                 free_newdirblk(WK_NEWDIRBLK(wk));
7666         if (!LIST_EMPTY(&newblk->nb_newdirblk))
7667                 panic("cancel_newblk: extra newdirblk");
7668
7669         return (jnewblk);
7670 }
7671
7672 /*
7673  * Schedule the freefrag associated with a newblk to be released once
7674  * the pointers are written and the previous block is no longer needed.
7675  */
7676 static void
7677 newblk_freefrag(struct newblk *newblk)
7678 {
7679         struct freefrag *freefrag;
7680
7681         if (newblk->nb_freefrag == NULL)
7682                 return;
7683         freefrag = newblk->nb_freefrag;
7684         newblk->nb_freefrag = NULL;
7685         freefrag->ff_state |= COMPLETE;
7686         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7687                 add_to_worklist(&freefrag->ff_list, 0);
7688 }
7689
7690 /*
7691  * Free a newblk. Generate a new freefrag work request if appropriate.
7692  * This must be called after the inode pointer and any direct block pointers
7693  * are valid or fully removed via truncate or frag extension.
7694  */
7695 static void
7696 free_newblk(struct newblk *newblk)
7697 {
7698         struct indirdep *indirdep;
7699         struct worklist *wk;
7700
7701         KASSERT(newblk->nb_jnewblk == NULL,
7702             ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7703         KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7704             ("free_newblk: unclaimed newblk"));
7705         LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7706         newblk_freefrag(newblk);
7707         if (newblk->nb_state & ONDEPLIST)
7708                 LIST_REMOVE(newblk, nb_deps);
7709         if (newblk->nb_state & ONWORKLIST)
7710                 WORKLIST_REMOVE(&newblk->nb_list);
7711         LIST_REMOVE(newblk, nb_hash);
7712         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7713                 free_newdirblk(WK_NEWDIRBLK(wk));
7714         if (!LIST_EMPTY(&newblk->nb_newdirblk))
7715                 panic("free_newblk: extra newdirblk");
7716         while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7717                 indirdep_complete(indirdep);
7718         handle_jwork(&newblk->nb_jwork);
7719         WORKITEM_FREE(newblk, D_NEWBLK);
7720 }
7721
7722 /*
7723  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7724  */
7725 static void
7726 free_newdirblk(struct newdirblk *newdirblk)
7727 {
7728         struct pagedep *pagedep;
7729         struct diradd *dap;
7730         struct worklist *wk;
7731
7732         LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7733         WORKLIST_REMOVE(&newdirblk->db_list);
7734         /*
7735          * If the pagedep is still linked onto the directory buffer
7736          * dependency chain, then some of the entries on the
7737          * pd_pendinghd list may not be committed to disk yet. In
7738          * this case, we will simply clear the NEWBLOCK flag and
7739          * let the pd_pendinghd list be processed when the pagedep
7740          * is next written. If the pagedep is no longer on the buffer
7741          * dependency chain, then all the entries on the pd_pending
7742          * list are committed to disk and we can free them here.
7743          */
7744         pagedep = newdirblk->db_pagedep;
7745         pagedep->pd_state &= ~NEWBLOCK;
7746         if ((pagedep->pd_state & ONWORKLIST) == 0) {
7747                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7748                         free_diradd(dap, NULL);
7749                 /*
7750                  * If no dependencies remain, the pagedep will be freed.
7751                  */
7752                 free_pagedep(pagedep);
7753         }
7754         /* Should only ever be one item in the list. */
7755         while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7756                 WORKLIST_REMOVE(wk);
7757                 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7758         }
7759         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7760 }
7761
7762 /*
7763  * Prepare an inode to be freed. The actual free operation is not
7764  * done until the zero'ed inode has been written to disk.
7765  */
7766 void
7767 softdep_freefile(
7768         struct vnode *pvp,
7769         ino_t ino,
7770         int mode)
7771 {
7772         struct inode *ip = VTOI(pvp);
7773         struct inodedep *inodedep;
7774         struct freefile *freefile;
7775         struct freeblks *freeblks;
7776         struct ufsmount *ump;
7777
7778         ump = ITOUMP(ip);
7779         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7780             ("softdep_freefile called on non-softdep filesystem"));
7781         /*
7782          * This sets up the inode de-allocation dependency.
7783          */
7784         freefile = malloc(sizeof(struct freefile),
7785                 M_FREEFILE, M_SOFTDEP_FLAGS);
7786         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7787         freefile->fx_mode = mode;
7788         freefile->fx_oldinum = ino;
7789         freefile->fx_devvp = ump->um_devvp;
7790         LIST_INIT(&freefile->fx_jwork);
7791         UFS_LOCK(ump);
7792         ump->um_fs->fs_pendinginodes += 1;
7793         UFS_UNLOCK(ump);
7794
7795         /*
7796          * If the inodedep does not exist, then the zero'ed inode has
7797          * been written to disk. If the allocated inode has never been
7798          * written to disk, then the on-disk inode is zero'ed. In either
7799          * case we can free the file immediately.  If the journal was
7800          * canceled before being written the inode will never make it to
7801          * disk and we must send the canceled journal entrys to
7802          * ffs_freefile() to be cleared in conjunction with the bitmap.
7803          * Any blocks waiting on the inode to write can be safely freed
7804          * here as it will never been written.
7805          */
7806         ACQUIRE_LOCK(ump);
7807         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7808         if (inodedep) {
7809                 /*
7810                  * Clear out freeblks that no longer need to reference
7811                  * this inode.
7812                  */
7813                 while ((freeblks =
7814                     TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7815                         TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7816                             fb_next);
7817                         freeblks->fb_state &= ~ONDEPLIST;
7818                 }
7819                 /*
7820                  * Remove this inode from the unlinked list.
7821                  */
7822                 if (inodedep->id_state & UNLINKED) {
7823                         /*
7824                          * Save the journal work to be freed with the bitmap
7825                          * before we clear UNLINKED.  Otherwise it can be lost
7826                          * if the inode block is written.
7827                          */
7828                         handle_bufwait(inodedep, &freefile->fx_jwork);
7829                         clear_unlinked_inodedep(inodedep);
7830                         /*
7831                          * Re-acquire inodedep as we've dropped the
7832                          * per-filesystem lock in clear_unlinked_inodedep().
7833                          */
7834                         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7835                 }
7836         }
7837         if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7838                 FREE_LOCK(ump);
7839                 handle_workitem_freefile(freefile);
7840                 return;
7841         }
7842         if ((inodedep->id_state & DEPCOMPLETE) == 0)
7843                 inodedep->id_state |= GOINGAWAY;
7844         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7845         FREE_LOCK(ump);
7846         if (ip->i_number == ino)
7847                 UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
7848 }
7849
7850 /*
7851  * Check to see if an inode has never been written to disk. If
7852  * so free the inodedep and return success, otherwise return failure.
7853  *
7854  * If we still have a bitmap dependency, then the inode has never
7855  * been written to disk. Drop the dependency as it is no longer
7856  * necessary since the inode is being deallocated. We set the
7857  * ALLCOMPLETE flags since the bitmap now properly shows that the
7858  * inode is not allocated. Even if the inode is actively being
7859  * written, it has been rolled back to its zero'ed state, so we
7860  * are ensured that a zero inode is what is on the disk. For short
7861  * lived files, this change will usually result in removing all the
7862  * dependencies from the inode so that it can be freed immediately.
7863  */
7864 static int
7865 check_inode_unwritten(struct inodedep *inodedep)
7866 {
7867
7868         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7869
7870         if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7871             !LIST_EMPTY(&inodedep->id_dirremhd) ||
7872             !LIST_EMPTY(&inodedep->id_pendinghd) ||
7873             !LIST_EMPTY(&inodedep->id_bufwait) ||
7874             !LIST_EMPTY(&inodedep->id_inowait) ||
7875             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7876             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7877             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7878             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7879             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7880             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7881             inodedep->id_mkdiradd != NULL ||
7882             inodedep->id_nlinkdelta != 0)
7883                 return (0);
7884         /*
7885          * Another process might be in initiate_write_inodeblock_ufs[12]
7886          * trying to allocate memory without holding "Softdep Lock".
7887          */
7888         if ((inodedep->id_state & IOSTARTED) != 0 &&
7889             inodedep->id_savedino1 == NULL)
7890                 return (0);
7891
7892         if (inodedep->id_state & ONDEPLIST)
7893                 LIST_REMOVE(inodedep, id_deps);
7894         inodedep->id_state &= ~ONDEPLIST;
7895         inodedep->id_state |= ALLCOMPLETE;
7896         inodedep->id_bmsafemap = NULL;
7897         if (inodedep->id_state & ONWORKLIST)
7898                 WORKLIST_REMOVE(&inodedep->id_list);
7899         if (inodedep->id_savedino1 != NULL) {
7900                 free(inodedep->id_savedino1, M_SAVEDINO);
7901                 inodedep->id_savedino1 = NULL;
7902         }
7903         if (free_inodedep(inodedep) == 0)
7904                 panic("check_inode_unwritten: busy inode");
7905         return (1);
7906 }
7907
7908 static int
7909 check_inodedep_free(struct inodedep *inodedep)
7910 {
7911
7912         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7913         if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7914             !LIST_EMPTY(&inodedep->id_dirremhd) ||
7915             !LIST_EMPTY(&inodedep->id_pendinghd) ||
7916             !LIST_EMPTY(&inodedep->id_bufwait) ||
7917             !LIST_EMPTY(&inodedep->id_inowait) ||
7918             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7919             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7920             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7921             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7922             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7923             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7924             inodedep->id_mkdiradd != NULL ||
7925             inodedep->id_nlinkdelta != 0 ||
7926             inodedep->id_savedino1 != NULL)
7927                 return (0);
7928         return (1);
7929 }
7930
7931 /*
7932  * Try to free an inodedep structure. Return 1 if it could be freed.
7933  */
7934 static int
7935 free_inodedep(struct inodedep *inodedep)
7936 {
7937
7938         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7939         if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7940             !check_inodedep_free(inodedep))
7941                 return (0);
7942         if (inodedep->id_state & ONDEPLIST)
7943                 LIST_REMOVE(inodedep, id_deps);
7944         LIST_REMOVE(inodedep, id_hash);
7945         WORKITEM_FREE(inodedep, D_INODEDEP);
7946         return (1);
7947 }
7948
7949 /*
7950  * Free the block referenced by a freework structure.  The parent freeblks
7951  * structure is released and completed when the final cg bitmap reaches
7952  * the disk.  This routine may be freeing a jnewblk which never made it to
7953  * disk in which case we do not have to wait as the operation is undone
7954  * in memory immediately.
7955  */
7956 static void
7957 freework_freeblock(struct freework *freework, uint64_t key)
7958 {
7959         struct freeblks *freeblks;
7960         struct jnewblk *jnewblk;
7961         struct ufsmount *ump;
7962         struct workhead wkhd;
7963         struct fs *fs;
7964         int bsize;
7965         int needj;
7966
7967         ump = VFSTOUFS(freework->fw_list.wk_mp);
7968         LOCK_OWNED(ump);
7969         /*
7970          * Handle partial truncate separately.
7971          */
7972         if (freework->fw_indir) {
7973                 complete_trunc_indir(freework);
7974                 return;
7975         }
7976         freeblks = freework->fw_freeblks;
7977         fs = ump->um_fs;
7978         needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7979         bsize = lfragtosize(fs, freework->fw_frags);
7980         LIST_INIT(&wkhd);
7981         /*
7982          * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7983          * on the indirblk hashtable and prevents premature freeing.
7984          */
7985         freework->fw_state |= DEPCOMPLETE;
7986         /*
7987          * SUJ needs to wait for the segment referencing freed indirect
7988          * blocks to expire so that we know the checker will not confuse
7989          * a re-allocated indirect block with its old contents.
7990          */
7991         if (needj && freework->fw_lbn <= -UFS_NDADDR)
7992                 indirblk_insert(freework);
7993         /*
7994          * If we are canceling an existing jnewblk pass it to the free
7995          * routine, otherwise pass the freeblk which will ultimately
7996          * release the freeblks.  If we're not journaling, we can just
7997          * free the freeblks immediately.
7998          */
7999         jnewblk = freework->fw_jnewblk;
8000         if (jnewblk != NULL) {
8001                 cancel_jnewblk(jnewblk, &wkhd);
8002                 needj = 0;
8003         } else if (needj) {
8004                 freework->fw_state |= DELAYEDFREE;
8005                 freeblks->fb_cgwait++;
8006                 WORKLIST_INSERT(&wkhd, &freework->fw_list);
8007         }
8008         FREE_LOCK(ump);
8009         freeblks_free(ump, freeblks, btodb(bsize));
8010         CTR4(KTR_SUJ,
8011             "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
8012             freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
8013         ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
8014             freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
8015         ACQUIRE_LOCK(ump);
8016         /*
8017          * The jnewblk will be discarded and the bits in the map never
8018          * made it to disk.  We can immediately free the freeblk.
8019          */
8020         if (needj == 0)
8021                 handle_written_freework(freework);
8022 }
8023
8024 /*
8025  * We enqueue freework items that need processing back on the freeblks and
8026  * add the freeblks to the worklist.  This makes it easier to find all work
8027  * required to flush a truncation in process_truncates().
8028  */
8029 static void
8030 freework_enqueue(struct freework *freework)
8031 {
8032         struct freeblks *freeblks;
8033
8034         freeblks = freework->fw_freeblks;
8035         if ((freework->fw_state & INPROGRESS) == 0)
8036                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
8037         if ((freeblks->fb_state &
8038             (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
8039             LIST_EMPTY(&freeblks->fb_jblkdephd))
8040                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
8041 }
8042
8043 /*
8044  * Start, continue, or finish the process of freeing an indirect block tree.
8045  * The free operation may be paused at any point with fw_off containing the
8046  * offset to restart from.  This enables us to implement some flow control
8047  * for large truncates which may fan out and generate a huge number of
8048  * dependencies.
8049  */
8050 static void
8051 handle_workitem_indirblk(struct freework *freework)
8052 {
8053         struct freeblks *freeblks;
8054         struct ufsmount *ump;
8055         struct fs *fs;
8056
8057         freeblks = freework->fw_freeblks;
8058         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8059         fs = ump->um_fs;
8060         if (freework->fw_state & DEPCOMPLETE) {
8061                 handle_written_freework(freework);
8062                 return;
8063         }
8064         if (freework->fw_off == NINDIR(fs)) {
8065                 freework_freeblock(freework, SINGLETON_KEY);
8066                 return;
8067         }
8068         freework->fw_state |= INPROGRESS;
8069         FREE_LOCK(ump);
8070         indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
8071             freework->fw_lbn);
8072         ACQUIRE_LOCK(ump);
8073 }
8074
8075 /*
8076  * Called when a freework structure attached to a cg buf is written.  The
8077  * ref on either the parent or the freeblks structure is released and
8078  * the freeblks is added back to the worklist if there is more work to do.
8079  */
8080 static void
8081 handle_written_freework(struct freework *freework)
8082 {
8083         struct freeblks *freeblks;
8084         struct freework *parent;
8085
8086         freeblks = freework->fw_freeblks;
8087         parent = freework->fw_parent;
8088         if (freework->fw_state & DELAYEDFREE)
8089                 freeblks->fb_cgwait--;
8090         freework->fw_state |= COMPLETE;
8091         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
8092                 WORKITEM_FREE(freework, D_FREEWORK);
8093         if (parent) {
8094                 if (--parent->fw_ref == 0)
8095                         freework_enqueue(parent);
8096                 return;
8097         }
8098         if (--freeblks->fb_ref != 0)
8099                 return;
8100         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
8101             ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
8102                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
8103 }
8104
8105 /*
8106  * This workitem routine performs the block de-allocation.
8107  * The workitem is added to the pending list after the updated
8108  * inode block has been written to disk.  As mentioned above,
8109  * checks regarding the number of blocks de-allocated (compared
8110  * to the number of blocks allocated for the file) are also
8111  * performed in this function.
8112  */
8113 static int
8114 handle_workitem_freeblocks(struct freeblks *freeblks, int flags)
8115 {
8116         struct freework *freework;
8117         struct newblk *newblk;
8118         struct allocindir *aip;
8119         struct ufsmount *ump;
8120         struct worklist *wk;
8121         uint64_t key;
8122
8123         KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
8124             ("handle_workitem_freeblocks: Journal entries not written."));
8125         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8126         key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
8127         ACQUIRE_LOCK(ump);
8128         while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
8129                 WORKLIST_REMOVE(wk);
8130                 switch (wk->wk_type) {
8131                 case D_DIRREM:
8132                         wk->wk_state |= COMPLETE;
8133                         add_to_worklist(wk, 0);
8134                         continue;
8135
8136                 case D_ALLOCDIRECT:
8137                         free_newblk(WK_NEWBLK(wk));
8138                         continue;
8139
8140                 case D_ALLOCINDIR:
8141                         aip = WK_ALLOCINDIR(wk);
8142                         freework = NULL;
8143                         if (aip->ai_state & DELAYEDFREE) {
8144                                 FREE_LOCK(ump);
8145                                 freework = newfreework(ump, freeblks, NULL,
8146                                     aip->ai_lbn, aip->ai_newblkno,
8147                                     ump->um_fs->fs_frag, 0, 0);
8148                                 ACQUIRE_LOCK(ump);
8149                         }
8150                         newblk = WK_NEWBLK(wk);
8151                         if (newblk->nb_jnewblk) {
8152                                 freework->fw_jnewblk = newblk->nb_jnewblk;
8153                                 newblk->nb_jnewblk->jn_dep = &freework->fw_list;
8154                                 newblk->nb_jnewblk = NULL;
8155                         }
8156                         free_newblk(newblk);
8157                         continue;
8158
8159                 case D_FREEWORK:
8160                         freework = WK_FREEWORK(wk);
8161                         if (freework->fw_lbn <= -UFS_NDADDR)
8162                                 handle_workitem_indirblk(freework);
8163                         else
8164                                 freework_freeblock(freework, key);
8165                         continue;
8166                 default:
8167                         panic("handle_workitem_freeblocks: Unknown type %s",
8168                             TYPENAME(wk->wk_type));
8169                 }
8170         }
8171         if (freeblks->fb_ref != 0) {
8172                 freeblks->fb_state &= ~INPROGRESS;
8173                 wake_worklist(&freeblks->fb_list);
8174                 freeblks = NULL;
8175         }
8176         FREE_LOCK(ump);
8177         ffs_blkrelease_finish(ump, key);
8178         if (freeblks)
8179                 return handle_complete_freeblocks(freeblks, flags);
8180         return (0);
8181 }
8182
8183 /*
8184  * Handle completion of block free via truncate.  This allows fs_pending
8185  * to track the actual free block count more closely than if we only updated
8186  * it at the end.  We must be careful to handle cases where the block count
8187  * on free was incorrect.
8188  */
8189 static void
8190 freeblks_free(struct ufsmount *ump,
8191         struct freeblks *freeblks,
8192         int blocks)
8193 {
8194         struct fs *fs;
8195         ufs2_daddr_t remain;
8196
8197         UFS_LOCK(ump);
8198         remain = -freeblks->fb_chkcnt;
8199         freeblks->fb_chkcnt += blocks;
8200         if (remain > 0) {
8201                 if (remain < blocks)
8202                         blocks = remain;
8203                 fs = ump->um_fs;
8204                 fs->fs_pendingblocks -= blocks;
8205         }
8206         UFS_UNLOCK(ump);
8207 }
8208
8209 /*
8210  * Once all of the freework workitems are complete we can retire the
8211  * freeblocks dependency and any journal work awaiting completion.  This
8212  * can not be called until all other dependencies are stable on disk.
8213  */
8214 static int
8215 handle_complete_freeblocks(struct freeblks *freeblks, int flags)
8216 {
8217         struct inodedep *inodedep;
8218         struct inode *ip;
8219         struct vnode *vp;
8220         struct fs *fs;
8221         struct ufsmount *ump;
8222         ufs2_daddr_t spare;
8223
8224         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8225         fs = ump->um_fs;
8226         flags = LK_EXCLUSIVE | flags;
8227         spare = freeblks->fb_chkcnt;
8228
8229         /*
8230          * If we did not release the expected number of blocks we may have
8231          * to adjust the inode block count here.  Only do so if it wasn't
8232          * a truncation to zero and the modrev still matches.
8233          */
8234         if (spare && freeblks->fb_len != 0) {
8235                 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8236                     flags, &vp, FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP) != 0)
8237                         return (EBUSY);
8238                 ip = VTOI(vp);
8239                 if (ip->i_mode == 0) {
8240                         vgone(vp);
8241                 } else if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
8242                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
8243                         UFS_INODE_SET_FLAG(ip, IN_CHANGE);
8244                         /*
8245                          * We must wait so this happens before the
8246                          * journal is reclaimed.
8247                          */
8248                         ffs_update(vp, 1);
8249                 }
8250                 vput(vp);
8251         }
8252         if (spare < 0) {
8253                 UFS_LOCK(ump);
8254                 fs->fs_pendingblocks += spare;
8255                 UFS_UNLOCK(ump);
8256         }
8257 #ifdef QUOTA
8258         /* Handle spare. */
8259         if (spare)
8260                 quotaadj(freeblks->fb_quota, ump, -spare);
8261         quotarele(freeblks->fb_quota);
8262 #endif
8263         ACQUIRE_LOCK(ump);
8264         if (freeblks->fb_state & ONDEPLIST) {
8265                 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8266                     0, &inodedep);
8267                 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8268                 freeblks->fb_state &= ~ONDEPLIST;
8269                 if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8270                         free_inodedep(inodedep);
8271         }
8272         /*
8273          * All of the freeblock deps must be complete prior to this call
8274          * so it's now safe to complete earlier outstanding journal entries.
8275          */
8276         handle_jwork(&freeblks->fb_jwork);
8277         WORKITEM_FREE(freeblks, D_FREEBLKS);
8278         FREE_LOCK(ump);
8279         return (0);
8280 }
8281
8282 /*
8283  * Release blocks associated with the freeblks and stored in the indirect
8284  * block dbn. If level is greater than SINGLE, the block is an indirect block
8285  * and recursive calls to indirtrunc must be used to cleanse other indirect
8286  * blocks.
8287  *
8288  * This handles partial and complete truncation of blocks.  Partial is noted
8289  * with goingaway == 0.  In this case the freework is completed after the
8290  * zero'd indirects are written to disk.  For full truncation the freework
8291  * is completed after the block is freed.
8292  */
8293 static void
8294 indir_trunc(struct freework *freework,
8295         ufs2_daddr_t dbn,
8296         ufs_lbn_t lbn)
8297 {
8298         struct freework *nfreework;
8299         struct workhead wkhd;
8300         struct freeblks *freeblks;
8301         struct buf *bp;
8302         struct fs *fs;
8303         struct indirdep *indirdep;
8304         struct mount *mp;
8305         struct ufsmount *ump;
8306         ufs1_daddr_t *bap1;
8307         ufs2_daddr_t nb, nnb, *bap2;
8308         ufs_lbn_t lbnadd, nlbn;
8309         uint64_t key;
8310         int nblocks, ufs1fmt, freedblocks;
8311         int goingaway, freedeps, needj, level, cnt, i, error;
8312
8313         freeblks = freework->fw_freeblks;
8314         mp = freeblks->fb_list.wk_mp;
8315         ump = VFSTOUFS(mp);
8316         fs = ump->um_fs;
8317         /*
8318          * Get buffer of block pointers to be freed.  There are three cases:
8319          *
8320          * 1) Partial truncate caches the indirdep pointer in the freework
8321          *    which provides us a back copy to the save bp which holds the
8322          *    pointers we want to clear.  When this completes the zero
8323          *    pointers are written to the real copy.
8324          * 2) The indirect is being completely truncated, cancel_indirdep()
8325          *    eliminated the real copy and placed the indirdep on the saved
8326          *    copy.  The indirdep and buf are discarded when this completes.
8327          * 3) The indirect was not in memory, we read a copy off of the disk
8328          *    using the devvp and drop and invalidate the buffer when we're
8329          *    done.
8330          */
8331         goingaway = 1;
8332         indirdep = NULL;
8333         if (freework->fw_indir != NULL) {
8334                 goingaway = 0;
8335                 indirdep = freework->fw_indir;
8336                 bp = indirdep->ir_savebp;
8337                 if (bp == NULL || bp->b_blkno != dbn)
8338                         panic("indir_trunc: Bad saved buf %p blkno %jd",
8339                             bp, (intmax_t)dbn);
8340         } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8341                 /*
8342                  * The lock prevents the buf dep list from changing and
8343                  * indirects on devvp should only ever have one dependency.
8344                  */
8345                 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8346                 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8347                         panic("indir_trunc: Bad indirdep %p from buf %p",
8348                             indirdep, bp);
8349         } else {
8350                 error = ffs_breadz(ump, freeblks->fb_devvp, dbn, dbn,
8351                     (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
8352                 if (error)
8353                         return;
8354         }
8355         ACQUIRE_LOCK(ump);
8356         /* Protects against a race with complete_trunc_indir(). */
8357         freework->fw_state &= ~INPROGRESS;
8358         /*
8359          * If we have an indirdep we need to enforce the truncation order
8360          * and discard it when it is complete.
8361          */
8362         if (indirdep) {
8363                 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8364                     !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8365                         /*
8366                          * Add the complete truncate to the list on the
8367                          * indirdep to enforce in-order processing.
8368                          */
8369                         if (freework->fw_indir == NULL)
8370                                 TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8371                                     freework, fw_next);
8372                         FREE_LOCK(ump);
8373                         return;
8374                 }
8375                 /*
8376                  * If we're goingaway, free the indirdep.  Otherwise it will
8377                  * linger until the write completes.
8378                  */
8379                 if (goingaway) {
8380                         KASSERT(indirdep->ir_savebp == bp,
8381                             ("indir_trunc: losing ir_savebp %p",
8382                             indirdep->ir_savebp));
8383                         indirdep->ir_savebp = NULL;
8384                         free_indirdep(indirdep);
8385                 }
8386         }
8387         FREE_LOCK(ump);
8388         /* Initialize pointers depending on block size. */
8389         if (ump->um_fstype == UFS1) {
8390                 bap1 = (ufs1_daddr_t *)bp->b_data;
8391                 nb = bap1[freework->fw_off];
8392                 ufs1fmt = 1;
8393                 bap2 = NULL;
8394         } else {
8395                 bap2 = (ufs2_daddr_t *)bp->b_data;
8396                 nb = bap2[freework->fw_off];
8397                 ufs1fmt = 0;
8398                 bap1 = NULL;
8399         }
8400         level = lbn_level(lbn);
8401         needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8402         lbnadd = lbn_offset(fs, level);
8403         nblocks = btodb(fs->fs_bsize);
8404         nfreework = freework;
8405         freedeps = 0;
8406         cnt = 0;
8407         /*
8408          * Reclaim blocks.  Traverses into nested indirect levels and
8409          * arranges for the current level to be freed when subordinates
8410          * are free when journaling.
8411          */
8412         key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
8413         for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8414                 if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb,
8415                     fs->fs_bsize) != 0)
8416                         nb = 0;
8417                 if (i != NINDIR(fs) - 1) {
8418                         if (ufs1fmt)
8419                                 nnb = bap1[i+1];
8420                         else
8421                                 nnb = bap2[i+1];
8422                 } else
8423                         nnb = 0;
8424                 if (nb == 0)
8425                         continue;
8426                 cnt++;
8427                 if (level != 0) {
8428                         nlbn = (lbn + 1) - (i * lbnadd);
8429                         if (needj != 0) {
8430                                 nfreework = newfreework(ump, freeblks, freework,
8431                                     nlbn, nb, fs->fs_frag, 0, 0);
8432                                 freedeps++;
8433                         }
8434                         indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8435                 } else {
8436                         struct freedep *freedep;
8437
8438                         /*
8439                          * Attempt to aggregate freedep dependencies for
8440                          * all blocks being released to the same CG.
8441                          */
8442                         LIST_INIT(&wkhd);
8443                         if (needj != 0 &&
8444                             (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8445                                 freedep = newfreedep(freework);
8446                                 WORKLIST_INSERT_UNLOCKED(&wkhd,
8447                                     &freedep->fd_list);
8448                                 freedeps++;
8449                         }
8450                         CTR3(KTR_SUJ,
8451                             "indir_trunc: ino %jd blkno %jd size %d",
8452                             freeblks->fb_inum, nb, fs->fs_bsize);
8453                         ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8454                             fs->fs_bsize, freeblks->fb_inum,
8455                             freeblks->fb_vtype, &wkhd, key);
8456                 }
8457         }
8458         ffs_blkrelease_finish(ump, key);
8459         if (goingaway) {
8460                 bp->b_flags |= B_INVAL | B_NOCACHE;
8461                 brelse(bp);
8462         }
8463         freedblocks = 0;
8464         if (level == 0)
8465                 freedblocks = (nblocks * cnt);
8466         if (needj == 0)
8467                 freedblocks += nblocks;
8468         freeblks_free(ump, freeblks, freedblocks);
8469         /*
8470          * If we are journaling set up the ref counts and offset so this
8471          * indirect can be completed when its children are free.
8472          */
8473         if (needj) {
8474                 ACQUIRE_LOCK(ump);
8475                 freework->fw_off = i;
8476                 freework->fw_ref += freedeps;
8477                 freework->fw_ref -= NINDIR(fs) + 1;
8478                 if (level == 0)
8479                         freeblks->fb_cgwait += freedeps;
8480                 if (freework->fw_ref == 0)
8481                         freework_freeblock(freework, SINGLETON_KEY);
8482                 FREE_LOCK(ump);
8483                 return;
8484         }
8485         /*
8486          * If we're not journaling we can free the indirect now.
8487          */
8488         dbn = dbtofsb(fs, dbn);
8489         CTR3(KTR_SUJ,
8490             "indir_trunc 2: ino %jd blkno %jd size %d",
8491             freeblks->fb_inum, dbn, fs->fs_bsize);
8492         ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8493             freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
8494         /* Non SUJ softdep does single-threaded truncations. */
8495         if (freework->fw_blkno == dbn) {
8496                 freework->fw_state |= ALLCOMPLETE;
8497                 ACQUIRE_LOCK(ump);
8498                 handle_written_freework(freework);
8499                 FREE_LOCK(ump);
8500         }
8501         return;
8502 }
8503
8504 /*
8505  * Cancel an allocindir when it is removed via truncation.  When bp is not
8506  * NULL the indirect never appeared on disk and is scheduled to be freed
8507  * independently of the indir so we can more easily track journal work.
8508  */
8509 static void
8510 cancel_allocindir(
8511         struct allocindir *aip,
8512         struct buf *bp,
8513         struct freeblks *freeblks,
8514         int trunc)
8515 {
8516         struct indirdep *indirdep;
8517         struct freefrag *freefrag;
8518         struct newblk *newblk;
8519
8520         newblk = (struct newblk *)aip;
8521         LIST_REMOVE(aip, ai_next);
8522         /*
8523          * We must eliminate the pointer in bp if it must be freed on its
8524          * own due to partial truncate or pending journal work.
8525          */
8526         if (bp && (trunc || newblk->nb_jnewblk)) {
8527                 /*
8528                  * Clear the pointer and mark the aip to be freed
8529                  * directly if it never existed on disk.
8530                  */
8531                 aip->ai_state |= DELAYEDFREE;
8532                 indirdep = aip->ai_indirdep;
8533                 if (indirdep->ir_state & UFS1FMT)
8534                         ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8535                 else
8536                         ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8537         }
8538         /*
8539          * When truncating the previous pointer will be freed via
8540          * savedbp.  Eliminate the freefrag which would dup free.
8541          */
8542         if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8543                 newblk->nb_freefrag = NULL;
8544                 if (freefrag->ff_jdep)
8545                         cancel_jfreefrag(
8546                             WK_JFREEFRAG(freefrag->ff_jdep));
8547                 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8548                 WORKITEM_FREE(freefrag, D_FREEFRAG);
8549         }
8550         /*
8551          * If the journal hasn't been written the jnewblk must be passed
8552          * to the call to ffs_blkfree that reclaims the space.  We accomplish
8553          * this by leaving the journal dependency on the newblk to be freed
8554          * when a freework is created in handle_workitem_freeblocks().
8555          */
8556         cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8557         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8558 }
8559
8560 /*
8561  * Create the mkdir dependencies for . and .. in a new directory.  Link them
8562  * in to a newdirblk so any subsequent additions are tracked properly.  The
8563  * caller is responsible for adding the mkdir1 dependency to the journal
8564  * and updating id_mkdiradd.  This function returns with the per-filesystem
8565  * lock held.
8566  */
8567 static struct mkdir *
8568 setup_newdir(
8569         struct diradd *dap,
8570         ino_t newinum,
8571         ino_t dinum,
8572         struct buf *newdirbp,
8573         struct mkdir **mkdirp)
8574 {
8575         struct newblk *newblk;
8576         struct pagedep *pagedep;
8577         struct inodedep *inodedep;
8578         struct newdirblk *newdirblk;
8579         struct mkdir *mkdir1, *mkdir2;
8580         struct worklist *wk;
8581         struct jaddref *jaddref;
8582         struct ufsmount *ump;
8583         struct mount *mp;
8584
8585         mp = dap->da_list.wk_mp;
8586         ump = VFSTOUFS(mp);
8587         newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8588             M_SOFTDEP_FLAGS);
8589         workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8590         LIST_INIT(&newdirblk->db_mkdir);
8591         mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8592         workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8593         mkdir1->md_state = ATTACHED | MKDIR_BODY;
8594         mkdir1->md_diradd = dap;
8595         mkdir1->md_jaddref = NULL;
8596         mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8597         workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8598         mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8599         mkdir2->md_diradd = dap;
8600         mkdir2->md_jaddref = NULL;
8601         if (MOUNTEDSUJ(mp) == 0) {
8602                 mkdir1->md_state |= DEPCOMPLETE;
8603                 mkdir2->md_state |= DEPCOMPLETE;
8604         }
8605         /*
8606          * Dependency on "." and ".." being written to disk.
8607          */
8608         mkdir1->md_buf = newdirbp;
8609         ACQUIRE_LOCK(VFSTOUFS(mp));
8610         LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8611         /*
8612          * We must link the pagedep, allocdirect, and newdirblk for
8613          * the initial file page so the pointer to the new directory
8614          * is not written until the directory contents are live and
8615          * any subsequent additions are not marked live until the
8616          * block is reachable via the inode.
8617          */
8618         if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8619                 panic("setup_newdir: lost pagedep");
8620         LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8621                 if (wk->wk_type == D_ALLOCDIRECT)
8622                         break;
8623         if (wk == NULL)
8624                 panic("setup_newdir: lost allocdirect");
8625         if (pagedep->pd_state & NEWBLOCK)
8626                 panic("setup_newdir: NEWBLOCK already set");
8627         newblk = WK_NEWBLK(wk);
8628         pagedep->pd_state |= NEWBLOCK;
8629         pagedep->pd_newdirblk = newdirblk;
8630         newdirblk->db_pagedep = pagedep;
8631         WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8632         WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8633         /*
8634          * Look up the inodedep for the parent directory so that we
8635          * can link mkdir2 into the pending dotdot jaddref or
8636          * the inode write if there is none.  If the inode is
8637          * ALLCOMPLETE and no jaddref is present all dependencies have
8638          * been satisfied and mkdir2 can be freed.
8639          */
8640         inodedep_lookup(mp, dinum, 0, &inodedep);
8641         if (MOUNTEDSUJ(mp)) {
8642                 if (inodedep == NULL)
8643                         panic("setup_newdir: Lost parent.");
8644                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8645                     inoreflst);
8646                 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8647                     (jaddref->ja_state & MKDIR_PARENT),
8648                     ("setup_newdir: bad dotdot jaddref %p", jaddref));
8649                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8650                 mkdir2->md_jaddref = jaddref;
8651                 jaddref->ja_mkdir = mkdir2;
8652         } else if (inodedep == NULL ||
8653             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8654                 dap->da_state &= ~MKDIR_PARENT;
8655                 WORKITEM_FREE(mkdir2, D_MKDIR);
8656                 mkdir2 = NULL;
8657         } else {
8658                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8659                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8660         }
8661         *mkdirp = mkdir2;
8662
8663         return (mkdir1);
8664 }
8665
8666 /*
8667  * Directory entry addition dependencies.
8668  *
8669  * When adding a new directory entry, the inode (with its incremented link
8670  * count) must be written to disk before the directory entry's pointer to it.
8671  * Also, if the inode is newly allocated, the corresponding freemap must be
8672  * updated (on disk) before the directory entry's pointer. These requirements
8673  * are met via undo/redo on the directory entry's pointer, which consists
8674  * simply of the inode number.
8675  *
8676  * As directory entries are added and deleted, the free space within a
8677  * directory block can become fragmented.  The ufs filesystem will compact
8678  * a fragmented directory block to make space for a new entry. When this
8679  * occurs, the offsets of previously added entries change. Any "diradd"
8680  * dependency structures corresponding to these entries must be updated with
8681  * the new offsets.
8682  */
8683
8684 /*
8685  * This routine is called after the in-memory inode's link
8686  * count has been incremented, but before the directory entry's
8687  * pointer to the inode has been set.
8688  */
8689 int
8690 softdep_setup_directory_add(
8691         struct buf *bp,         /* buffer containing directory block */
8692         struct inode *dp,       /* inode for directory */
8693         off_t diroffset,        /* offset of new entry in directory */
8694         ino_t newinum,          /* inode referenced by new directory entry */
8695         struct buf *newdirbp,   /* non-NULL => contents of new mkdir */
8696         int isnewblk)           /* entry is in a newly allocated block */
8697 {
8698         int offset;             /* offset of new entry within directory block */
8699         ufs_lbn_t lbn;          /* block in directory containing new entry */
8700         struct fs *fs;
8701         struct diradd *dap;
8702         struct newblk *newblk;
8703         struct pagedep *pagedep;
8704         struct inodedep *inodedep;
8705         struct newdirblk *newdirblk;
8706         struct mkdir *mkdir1, *mkdir2;
8707         struct jaddref *jaddref;
8708         struct ufsmount *ump;
8709         struct mount *mp;
8710         int isindir;
8711
8712         mp = ITOVFS(dp);
8713         ump = VFSTOUFS(mp);
8714         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8715             ("softdep_setup_directory_add called on non-softdep filesystem"));
8716         /*
8717          * Whiteouts have no dependencies.
8718          */
8719         if (newinum == UFS_WINO) {
8720                 if (newdirbp != NULL)
8721                         bdwrite(newdirbp);
8722                 return (0);
8723         }
8724         jaddref = NULL;
8725         mkdir1 = mkdir2 = NULL;
8726         fs = ump->um_fs;
8727         lbn = lblkno(fs, diroffset);
8728         offset = blkoff(fs, diroffset);
8729         dap = malloc(sizeof(struct diradd), M_DIRADD,
8730                 M_SOFTDEP_FLAGS|M_ZERO);
8731         workitem_alloc(&dap->da_list, D_DIRADD, mp);
8732         dap->da_offset = offset;
8733         dap->da_newinum = newinum;
8734         dap->da_state = ATTACHED;
8735         LIST_INIT(&dap->da_jwork);
8736         isindir = bp->b_lblkno >= UFS_NDADDR;
8737         newdirblk = NULL;
8738         if (isnewblk &&
8739             (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8740                 newdirblk = malloc(sizeof(struct newdirblk),
8741                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8742                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8743                 LIST_INIT(&newdirblk->db_mkdir);
8744         }
8745         /*
8746          * If we're creating a new directory setup the dependencies and set
8747          * the dap state to wait for them.  Otherwise it's COMPLETE and
8748          * we can move on.
8749          */
8750         if (newdirbp == NULL) {
8751                 dap->da_state |= DEPCOMPLETE;
8752                 ACQUIRE_LOCK(ump);
8753         } else {
8754                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8755                 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8756                     &mkdir2);
8757         }
8758         /*
8759          * Link into parent directory pagedep to await its being written.
8760          */
8761         pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8762 #ifdef INVARIANTS
8763         if (diradd_lookup(pagedep, offset) != NULL)
8764                 panic("softdep_setup_directory_add: %p already at off %d\n",
8765                     diradd_lookup(pagedep, offset), offset);
8766 #endif
8767         dap->da_pagedep = pagedep;
8768         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8769             da_pdlist);
8770         inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
8771         /*
8772          * If we're journaling, link the diradd into the jaddref so it
8773          * may be completed after the journal entry is written.  Otherwise,
8774          * link the diradd into its inodedep.  If the inode is not yet
8775          * written place it on the bufwait list, otherwise do the post-inode
8776          * write processing to put it on the id_pendinghd list.
8777          */
8778         if (MOUNTEDSUJ(mp)) {
8779                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8780                     inoreflst);
8781                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8782                     ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8783                 jaddref->ja_diroff = diroffset;
8784                 jaddref->ja_diradd = dap;
8785                 add_to_journal(&jaddref->ja_list);
8786         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8787                 diradd_inode_written(dap, inodedep);
8788         else
8789                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8790         /*
8791          * Add the journal entries for . and .. links now that the primary
8792          * link is written.
8793          */
8794         if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8795                 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8796                     inoreflst, if_deps);
8797                 KASSERT(jaddref != NULL &&
8798                     jaddref->ja_ino == jaddref->ja_parent &&
8799                     (jaddref->ja_state & MKDIR_BODY),
8800                     ("softdep_setup_directory_add: bad dot jaddref %p",
8801                     jaddref));
8802                 mkdir1->md_jaddref = jaddref;
8803                 jaddref->ja_mkdir = mkdir1;
8804                 /*
8805                  * It is important that the dotdot journal entry
8806                  * is added prior to the dot entry since dot writes
8807                  * both the dot and dotdot links.  These both must
8808                  * be added after the primary link for the journal
8809                  * to remain consistent.
8810                  */
8811                 add_to_journal(&mkdir2->md_jaddref->ja_list);
8812                 add_to_journal(&jaddref->ja_list);
8813         }
8814         /*
8815          * If we are adding a new directory remember this diradd so that if
8816          * we rename it we can keep the dot and dotdot dependencies.  If
8817          * we are adding a new name for an inode that has a mkdiradd we
8818          * must be in rename and we have to move the dot and dotdot
8819          * dependencies to this new name.  The old name is being orphaned
8820          * soon.
8821          */
8822         if (mkdir1 != NULL) {
8823                 if (inodedep->id_mkdiradd != NULL)
8824                         panic("softdep_setup_directory_add: Existing mkdir");
8825                 inodedep->id_mkdiradd = dap;
8826         } else if (inodedep->id_mkdiradd)
8827                 merge_diradd(inodedep, dap);
8828         if (newdirblk != NULL) {
8829                 /*
8830                  * There is nothing to do if we are already tracking
8831                  * this block.
8832                  */
8833                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
8834                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8835                         FREE_LOCK(ump);
8836                         return (0);
8837                 }
8838                 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8839                     == 0)
8840                         panic("softdep_setup_directory_add: lost entry");
8841                 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8842                 pagedep->pd_state |= NEWBLOCK;
8843                 pagedep->pd_newdirblk = newdirblk;
8844                 newdirblk->db_pagedep = pagedep;
8845                 FREE_LOCK(ump);
8846                 /*
8847                  * If we extended into an indirect signal direnter to sync.
8848                  */
8849                 if (isindir)
8850                         return (1);
8851                 return (0);
8852         }
8853         FREE_LOCK(ump);
8854         return (0);
8855 }
8856
8857 /*
8858  * This procedure is called to change the offset of a directory
8859  * entry when compacting a directory block which must be owned
8860  * exclusively by the caller. Note that the actual entry movement
8861  * must be done in this procedure to ensure that no I/O completions
8862  * occur while the move is in progress.
8863  */
8864 void
8865 softdep_change_directoryentry_offset(
8866         struct buf *bp,         /* Buffer holding directory block. */
8867         struct inode *dp,       /* inode for directory */
8868         caddr_t base,           /* address of dp->i_offset */
8869         caddr_t oldloc,         /* address of old directory location */
8870         caddr_t newloc,         /* address of new directory location */
8871         int entrysize)          /* size of directory entry */
8872 {
8873         int offset, oldoffset, newoffset;
8874         struct pagedep *pagedep;
8875         struct jmvref *jmvref;
8876         struct diradd *dap;
8877         struct direct *de;
8878         struct mount *mp;
8879         struct ufsmount *ump;
8880         ufs_lbn_t lbn;
8881         int flags;
8882
8883         mp = ITOVFS(dp);
8884         ump = VFSTOUFS(mp);
8885         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8886             ("softdep_change_directoryentry_offset called on "
8887              "non-softdep filesystem"));
8888         de = (struct direct *)oldloc;
8889         jmvref = NULL;
8890         flags = 0;
8891         /*
8892          * Moves are always journaled as it would be too complex to
8893          * determine if any affected adds or removes are present in the
8894          * journal.
8895          */
8896         if (MOUNTEDSUJ(mp)) {
8897                 flags = DEPALLOC;
8898                 jmvref = newjmvref(dp, de->d_ino,
8899                     I_OFFSET(dp) + (oldloc - base),
8900                     I_OFFSET(dp) + (newloc - base));
8901         }
8902         lbn = lblkno(ump->um_fs, I_OFFSET(dp));
8903         offset = blkoff(ump->um_fs, I_OFFSET(dp));
8904         oldoffset = offset + (oldloc - base);
8905         newoffset = offset + (newloc - base);
8906         ACQUIRE_LOCK(ump);
8907         if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8908                 goto done;
8909         dap = diradd_lookup(pagedep, oldoffset);
8910         if (dap) {
8911                 dap->da_offset = newoffset;
8912                 newoffset = DIRADDHASH(newoffset);
8913                 oldoffset = DIRADDHASH(oldoffset);
8914                 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8915                     newoffset != oldoffset) {
8916                         LIST_REMOVE(dap, da_pdlist);
8917                         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8918                             dap, da_pdlist);
8919                 }
8920         }
8921 done:
8922         if (jmvref) {
8923                 jmvref->jm_pagedep = pagedep;
8924                 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8925                 add_to_journal(&jmvref->jm_list);
8926         }
8927         bcopy(oldloc, newloc, entrysize);
8928         FREE_LOCK(ump);
8929 }
8930
8931 /*
8932  * Move the mkdir dependencies and journal work from one diradd to another
8933  * when renaming a directory.  The new name must depend on the mkdir deps
8934  * completing as the old name did.  Directories can only have one valid link
8935  * at a time so one must be canonical.
8936  */
8937 static void
8938 merge_diradd(struct inodedep *inodedep, struct diradd *newdap)
8939 {
8940         struct diradd *olddap;
8941         struct mkdir *mkdir, *nextmd;
8942         struct ufsmount *ump;
8943         short state;
8944
8945         olddap = inodedep->id_mkdiradd;
8946         inodedep->id_mkdiradd = newdap;
8947         if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8948                 newdap->da_state &= ~DEPCOMPLETE;
8949                 ump = VFSTOUFS(inodedep->id_list.wk_mp);
8950                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8951                      mkdir = nextmd) {
8952                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
8953                         if (mkdir->md_diradd != olddap)
8954                                 continue;
8955                         mkdir->md_diradd = newdap;
8956                         state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8957                         newdap->da_state |= state;
8958                         olddap->da_state &= ~state;
8959                         if ((olddap->da_state &
8960                             (MKDIR_PARENT | MKDIR_BODY)) == 0)
8961                                 break;
8962                 }
8963                 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8964                         panic("merge_diradd: unfound ref");
8965         }
8966         /*
8967          * Any mkdir related journal items are not safe to be freed until
8968          * the new name is stable.
8969          */
8970         jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8971         olddap->da_state |= DEPCOMPLETE;
8972         complete_diradd(olddap);
8973 }
8974
8975 /*
8976  * Move the diradd to the pending list when all diradd dependencies are
8977  * complete.
8978  */
8979 static void
8980 complete_diradd(struct diradd *dap)
8981 {
8982         struct pagedep *pagedep;
8983
8984         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8985                 if (dap->da_state & DIRCHG)
8986                         pagedep = dap->da_previous->dm_pagedep;
8987                 else
8988                         pagedep = dap->da_pagedep;
8989                 LIST_REMOVE(dap, da_pdlist);
8990                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8991         }
8992 }
8993
8994 /*
8995  * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8996  * add entries and conditionally journal the remove.
8997  */
8998 static void
8999 cancel_diradd(
9000         struct diradd *dap,
9001         struct dirrem *dirrem,
9002         struct jremref *jremref,
9003         struct jremref *dotremref,
9004         struct jremref *dotdotremref)
9005 {
9006         struct inodedep *inodedep;
9007         struct jaddref *jaddref;
9008         struct inoref *inoref;
9009         struct ufsmount *ump;
9010         struct mkdir *mkdir;
9011
9012         /*
9013          * If no remove references were allocated we're on a non-journaled
9014          * filesystem and can skip the cancel step.
9015          */
9016         if (jremref == NULL) {
9017                 free_diradd(dap, NULL);
9018                 return;
9019         }
9020         /*
9021          * Cancel the primary name an free it if it does not require
9022          * journaling.
9023          */
9024         if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
9025             0, &inodedep) != 0) {
9026                 /* Abort the addref that reference this diradd.  */
9027                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
9028                         if (inoref->if_list.wk_type != D_JADDREF)
9029                                 continue;
9030                         jaddref = (struct jaddref *)inoref;
9031                         if (jaddref->ja_diradd != dap)
9032                                 continue;
9033                         if (cancel_jaddref(jaddref, inodedep,
9034                             &dirrem->dm_jwork) == 0) {
9035                                 free_jremref(jremref);
9036                                 jremref = NULL;
9037                         }
9038                         break;
9039                 }
9040         }
9041         /*
9042          * Cancel subordinate names and free them if they do not require
9043          * journaling.
9044          */
9045         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
9046                 ump = VFSTOUFS(dap->da_list.wk_mp);
9047                 LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
9048                         if (mkdir->md_diradd != dap)
9049                                 continue;
9050                         if ((jaddref = mkdir->md_jaddref) == NULL)
9051                                 continue;
9052                         mkdir->md_jaddref = NULL;
9053                         if (mkdir->md_state & MKDIR_PARENT) {
9054                                 if (cancel_jaddref(jaddref, NULL,
9055                                     &dirrem->dm_jwork) == 0) {
9056                                         free_jremref(dotdotremref);
9057                                         dotdotremref = NULL;
9058                                 }
9059                         } else {
9060                                 if (cancel_jaddref(jaddref, inodedep,
9061                                     &dirrem->dm_jwork) == 0) {
9062                                         free_jremref(dotremref);
9063                                         dotremref = NULL;
9064                                 }
9065                         }
9066                 }
9067         }
9068
9069         if (jremref)
9070                 journal_jremref(dirrem, jremref, inodedep);
9071         if (dotremref)
9072                 journal_jremref(dirrem, dotremref, inodedep);
9073         if (dotdotremref)
9074                 journal_jremref(dirrem, dotdotremref, NULL);
9075         jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
9076         free_diradd(dap, &dirrem->dm_jwork);
9077 }
9078
9079 /*
9080  * Free a diradd dependency structure.
9081  */
9082 static void
9083 free_diradd(struct diradd *dap, struct workhead *wkhd)
9084 {
9085         struct dirrem *dirrem;
9086         struct pagedep *pagedep;
9087         struct inodedep *inodedep;
9088         struct mkdir *mkdir, *nextmd;
9089         struct ufsmount *ump;
9090
9091         ump = VFSTOUFS(dap->da_list.wk_mp);
9092         LOCK_OWNED(ump);
9093         LIST_REMOVE(dap, da_pdlist);
9094         if (dap->da_state & ONWORKLIST)
9095                 WORKLIST_REMOVE(&dap->da_list);
9096         if ((dap->da_state & DIRCHG) == 0) {
9097                 pagedep = dap->da_pagedep;
9098         } else {
9099                 dirrem = dap->da_previous;
9100                 pagedep = dirrem->dm_pagedep;
9101                 dirrem->dm_dirinum = pagedep->pd_ino;
9102                 dirrem->dm_state |= COMPLETE;
9103                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9104                         add_to_worklist(&dirrem->dm_list, 0);
9105         }
9106         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
9107             0, &inodedep) != 0)
9108                 if (inodedep->id_mkdiradd == dap)
9109                         inodedep->id_mkdiradd = NULL;
9110         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
9111                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9112                      mkdir = nextmd) {
9113                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
9114                         if (mkdir->md_diradd != dap)
9115                                 continue;
9116                         dap->da_state &=
9117                             ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
9118                         LIST_REMOVE(mkdir, md_mkdirs);
9119                         if (mkdir->md_state & ONWORKLIST)
9120                                 WORKLIST_REMOVE(&mkdir->md_list);
9121                         if (mkdir->md_jaddref != NULL)
9122                                 panic("free_diradd: Unexpected jaddref");
9123                         WORKITEM_FREE(mkdir, D_MKDIR);
9124                         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
9125                                 break;
9126                 }
9127                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
9128                         panic("free_diradd: unfound ref");
9129         }
9130         if (inodedep)
9131                 free_inodedep(inodedep);
9132         /*
9133          * Free any journal segments waiting for the directory write.
9134          */
9135         handle_jwork(&dap->da_jwork);
9136         WORKITEM_FREE(dap, D_DIRADD);
9137 }
9138
9139 /*
9140  * Directory entry removal dependencies.
9141  *
9142  * When removing a directory entry, the entry's inode pointer must be
9143  * zero'ed on disk before the corresponding inode's link count is decremented
9144  * (possibly freeing the inode for re-use). This dependency is handled by
9145  * updating the directory entry but delaying the inode count reduction until
9146  * after the directory block has been written to disk. After this point, the
9147  * inode count can be decremented whenever it is convenient.
9148  */
9149
9150 /*
9151  * This routine should be called immediately after removing
9152  * a directory entry.  The inode's link count should not be
9153  * decremented by the calling procedure -- the soft updates
9154  * code will do this task when it is safe.
9155  */
9156 void
9157 softdep_setup_remove(
9158         struct buf *bp,         /* buffer containing directory block */
9159         struct inode *dp,       /* inode for the directory being modified */
9160         struct inode *ip,       /* inode for directory entry being removed */
9161         int isrmdir)            /* indicates if doing RMDIR */
9162 {
9163         struct dirrem *dirrem, *prevdirrem;
9164         struct inodedep *inodedep;
9165         struct ufsmount *ump;
9166         int direct;
9167
9168         ump = ITOUMP(ip);
9169         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9170             ("softdep_setup_remove called on non-softdep filesystem"));
9171         /*
9172          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
9173          * newdirrem() to setup the full directory remove which requires
9174          * isrmdir > 1.
9175          */
9176         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9177         /*
9178          * Add the dirrem to the inodedep's pending remove list for quick
9179          * discovery later.
9180          */
9181         if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
9182                 panic("softdep_setup_remove: Lost inodedep.");
9183         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
9184         dirrem->dm_state |= ONDEPLIST;
9185         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9186
9187         /*
9188          * If the COMPLETE flag is clear, then there were no active
9189          * entries and we want to roll back to a zeroed entry until
9190          * the new inode is committed to disk. If the COMPLETE flag is
9191          * set then we have deleted an entry that never made it to
9192          * disk. If the entry we deleted resulted from a name change,
9193          * then the old name still resides on disk. We cannot delete
9194          * its inode (returned to us in prevdirrem) until the zeroed
9195          * directory entry gets to disk. The new inode has never been
9196          * referenced on the disk, so can be deleted immediately.
9197          */
9198         if ((dirrem->dm_state & COMPLETE) == 0) {
9199                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
9200                     dm_next);
9201                 FREE_LOCK(ump);
9202         } else {
9203                 if (prevdirrem != NULL)
9204                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
9205                             prevdirrem, dm_next);
9206                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
9207                 direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
9208                 FREE_LOCK(ump);
9209                 if (direct)
9210                         handle_workitem_remove(dirrem, 0);
9211         }
9212 }
9213
9214 /*
9215  * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
9216  * pd_pendinghd list of a pagedep.
9217  */
9218 static struct diradd *
9219 diradd_lookup(struct pagedep *pagedep, int offset)
9220 {
9221         struct diradd *dap;
9222
9223         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
9224                 if (dap->da_offset == offset)
9225                         return (dap);
9226         LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
9227                 if (dap->da_offset == offset)
9228                         return (dap);
9229         return (NULL);
9230 }
9231
9232 /*
9233  * Search for a .. diradd dependency in a directory that is being removed.
9234  * If the directory was renamed to a new parent we have a diradd rather
9235  * than a mkdir for the .. entry.  We need to cancel it now before
9236  * it is found in truncate().
9237  */
9238 static struct jremref *
9239 cancel_diradd_dotdot(struct inode *ip,
9240         struct dirrem *dirrem,
9241         struct jremref *jremref)
9242 {
9243         struct pagedep *pagedep;
9244         struct diradd *dap;
9245         struct worklist *wk;
9246
9247         if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0)
9248                 return (jremref);
9249         dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
9250         if (dap == NULL)
9251                 return (jremref);
9252         cancel_diradd(dap, dirrem, jremref, NULL, NULL);
9253         /*
9254          * Mark any journal work as belonging to the parent so it is freed
9255          * with the .. reference.
9256          */
9257         LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9258                 wk->wk_state |= MKDIR_PARENT;
9259         return (NULL);
9260 }
9261
9262 /*
9263  * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9264  * replace it with a dirrem/diradd pair as a result of re-parenting a
9265  * directory.  This ensures that we don't simultaneously have a mkdir and
9266  * a diradd for the same .. entry.
9267  */
9268 static struct jremref *
9269 cancel_mkdir_dotdot(struct inode *ip,
9270         struct dirrem *dirrem,
9271         struct jremref *jremref)
9272 {
9273         struct inodedep *inodedep;
9274         struct jaddref *jaddref;
9275         struct ufsmount *ump;
9276         struct mkdir *mkdir;
9277         struct diradd *dap;
9278         struct mount *mp;
9279
9280         mp = ITOVFS(ip);
9281         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9282                 return (jremref);
9283         dap = inodedep->id_mkdiradd;
9284         if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9285                 return (jremref);
9286         ump = VFSTOUFS(inodedep->id_list.wk_mp);
9287         for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9288             mkdir = LIST_NEXT(mkdir, md_mkdirs))
9289                 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9290                         break;
9291         if (mkdir == NULL)
9292                 panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9293         if ((jaddref = mkdir->md_jaddref) != NULL) {
9294                 mkdir->md_jaddref = NULL;
9295                 jaddref->ja_state &= ~MKDIR_PARENT;
9296                 if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
9297                         panic("cancel_mkdir_dotdot: Lost parent inodedep");
9298                 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9299                         journal_jremref(dirrem, jremref, inodedep);
9300                         jremref = NULL;
9301                 }
9302         }
9303         if (mkdir->md_state & ONWORKLIST)
9304                 WORKLIST_REMOVE(&mkdir->md_list);
9305         mkdir->md_state |= ALLCOMPLETE;
9306         complete_mkdir(mkdir);
9307         return (jremref);
9308 }
9309
9310 static void
9311 journal_jremref(struct dirrem *dirrem,
9312         struct jremref *jremref,
9313         struct inodedep *inodedep)
9314 {
9315
9316         if (inodedep == NULL)
9317                 if (inodedep_lookup(jremref->jr_list.wk_mp,
9318                     jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9319                         panic("journal_jremref: Lost inodedep");
9320         LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9321         TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9322         add_to_journal(&jremref->jr_list);
9323 }
9324
9325 static void
9326 dirrem_journal(
9327         struct dirrem *dirrem,
9328         struct jremref *jremref,
9329         struct jremref *dotremref,
9330         struct jremref *dotdotremref)
9331 {
9332         struct inodedep *inodedep;
9333
9334         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9335             &inodedep) == 0)
9336                 panic("dirrem_journal: Lost inodedep");
9337         journal_jremref(dirrem, jremref, inodedep);
9338         if (dotremref)
9339                 journal_jremref(dirrem, dotremref, inodedep);
9340         if (dotdotremref)
9341                 journal_jremref(dirrem, dotdotremref, NULL);
9342 }
9343
9344 /*
9345  * Allocate a new dirrem if appropriate and return it along with
9346  * its associated pagedep. Called without a lock, returns with lock.
9347  */
9348 static struct dirrem *
9349 newdirrem(
9350         struct buf *bp,         /* buffer containing directory block */
9351         struct inode *dp,       /* inode for the directory being modified */
9352         struct inode *ip,       /* inode for directory entry being removed */
9353         int isrmdir,            /* indicates if doing RMDIR */
9354         struct dirrem **prevdirremp) /* previously referenced inode, if any */
9355 {
9356         int offset;
9357         ufs_lbn_t lbn;
9358         struct diradd *dap;
9359         struct dirrem *dirrem;
9360         struct pagedep *pagedep;
9361         struct jremref *jremref;
9362         struct jremref *dotremref;
9363         struct jremref *dotdotremref;
9364         struct vnode *dvp;
9365         struct ufsmount *ump;
9366
9367         /*
9368          * Whiteouts have no deletion dependencies.
9369          */
9370         if (ip == NULL)
9371                 panic("newdirrem: whiteout");
9372         dvp = ITOV(dp);
9373         ump = ITOUMP(dp);
9374
9375         /*
9376          * If the system is over its limit and our filesystem is
9377          * responsible for more than our share of that usage and
9378          * we are not a snapshot, request some inodedep cleanup.
9379          * Limiting the number of dirrem structures will also limit
9380          * the number of freefile and freeblks structures.
9381          */
9382         ACQUIRE_LOCK(ump);
9383         if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
9384                 schedule_cleanup(UFSTOVFS(ump));
9385         else
9386                 FREE_LOCK(ump);
9387         dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9388             M_ZERO);
9389         workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9390         LIST_INIT(&dirrem->dm_jremrefhd);
9391         LIST_INIT(&dirrem->dm_jwork);
9392         dirrem->dm_state = isrmdir ? RMDIR : 0;
9393         dirrem->dm_oldinum = ip->i_number;
9394         *prevdirremp = NULL;
9395         /*
9396          * Allocate remove reference structures to track journal write
9397          * dependencies.  We will always have one for the link and
9398          * when doing directories we will always have one more for dot.
9399          * When renaming a directory we skip the dotdot link change so
9400          * this is not needed.
9401          */
9402         jremref = dotremref = dotdotremref = NULL;
9403         if (DOINGSUJ(dvp)) {
9404                 if (isrmdir) {
9405                         jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp),
9406                             ip->i_effnlink + 2);
9407                         dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9408                             ip->i_effnlink + 1);
9409                         dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9410                             dp->i_effnlink + 1);
9411                         dotdotremref->jr_state |= MKDIR_PARENT;
9412                 } else
9413                         jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp),
9414                             ip->i_effnlink + 1);
9415         }
9416         ACQUIRE_LOCK(ump);
9417         lbn = lblkno(ump->um_fs, I_OFFSET(dp));
9418         offset = blkoff(ump->um_fs, I_OFFSET(dp));
9419         pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
9420             &pagedep);
9421         dirrem->dm_pagedep = pagedep;
9422         dirrem->dm_offset = offset;
9423         /*
9424          * If we're renaming a .. link to a new directory, cancel any
9425          * existing MKDIR_PARENT mkdir.  If it has already been canceled
9426          * the jremref is preserved for any potential diradd in this
9427          * location.  This can not coincide with a rmdir.
9428          */
9429         if (I_OFFSET(dp) == DOTDOT_OFFSET) {
9430                 if (isrmdir)
9431                         panic("newdirrem: .. directory change during remove?");
9432                 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9433         }
9434         /*
9435          * If we're removing a directory search for the .. dependency now and
9436          * cancel it.  Any pending journal work will be added to the dirrem
9437          * to be completed when the workitem remove completes.
9438          */
9439         if (isrmdir)
9440                 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9441         /*
9442          * Check for a diradd dependency for the same directory entry.
9443          * If present, then both dependencies become obsolete and can
9444          * be de-allocated.
9445          */
9446         dap = diradd_lookup(pagedep, offset);
9447         if (dap == NULL) {
9448                 /*
9449                  * Link the jremref structures into the dirrem so they are
9450                  * written prior to the pagedep.
9451                  */
9452                 if (jremref)
9453                         dirrem_journal(dirrem, jremref, dotremref,
9454                             dotdotremref);
9455                 return (dirrem);
9456         }
9457         /*
9458          * Must be ATTACHED at this point.
9459          */
9460         if ((dap->da_state & ATTACHED) == 0)
9461                 panic("newdirrem: not ATTACHED");
9462         if (dap->da_newinum != ip->i_number)
9463                 panic("newdirrem: inum %ju should be %ju",
9464                     (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9465         /*
9466          * If we are deleting a changed name that never made it to disk,
9467          * then return the dirrem describing the previous inode (which
9468          * represents the inode currently referenced from this entry on disk).
9469          */
9470         if ((dap->da_state & DIRCHG) != 0) {
9471                 *prevdirremp = dap->da_previous;
9472                 dap->da_state &= ~DIRCHG;
9473                 dap->da_pagedep = pagedep;
9474         }
9475         /*
9476          * We are deleting an entry that never made it to disk.
9477          * Mark it COMPLETE so we can delete its inode immediately.
9478          */
9479         dirrem->dm_state |= COMPLETE;
9480         cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9481 #ifdef INVARIANTS
9482         if (isrmdir == 0) {
9483                 struct worklist *wk;
9484
9485                 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9486                         if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9487                                 panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9488         }
9489 #endif
9490
9491         return (dirrem);
9492 }
9493
9494 /*
9495  * Directory entry change dependencies.
9496  *
9497  * Changing an existing directory entry requires that an add operation
9498  * be completed first followed by a deletion. The semantics for the addition
9499  * are identical to the description of adding a new entry above except
9500  * that the rollback is to the old inode number rather than zero. Once
9501  * the addition dependency is completed, the removal is done as described
9502  * in the removal routine above.
9503  */
9504
9505 /*
9506  * This routine should be called immediately after changing
9507  * a directory entry.  The inode's link count should not be
9508  * decremented by the calling procedure -- the soft updates
9509  * code will perform this task when it is safe.
9510  */
9511 void
9512 softdep_setup_directory_change(
9513         struct buf *bp,         /* buffer containing directory block */
9514         struct inode *dp,       /* inode for the directory being modified */
9515         struct inode *ip,       /* inode for directory entry being removed */
9516         ino_t newinum,          /* new inode number for changed entry */
9517         int isrmdir)            /* indicates if doing RMDIR */
9518 {
9519         int offset;
9520         struct diradd *dap = NULL;
9521         struct dirrem *dirrem, *prevdirrem;
9522         struct pagedep *pagedep;
9523         struct inodedep *inodedep;
9524         struct jaddref *jaddref;
9525         struct mount *mp;
9526         struct ufsmount *ump;
9527
9528         mp = ITOVFS(dp);
9529         ump = VFSTOUFS(mp);
9530         offset = blkoff(ump->um_fs, I_OFFSET(dp));
9531         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9532            ("softdep_setup_directory_change called on non-softdep filesystem"));
9533
9534         /*
9535          * Whiteouts do not need diradd dependencies.
9536          */
9537         if (newinum != UFS_WINO) {
9538                 dap = malloc(sizeof(struct diradd),
9539                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9540                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
9541                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9542                 dap->da_offset = offset;
9543                 dap->da_newinum = newinum;
9544                 LIST_INIT(&dap->da_jwork);
9545         }
9546
9547         /*
9548          * Allocate a new dirrem and ACQUIRE_LOCK.
9549          */
9550         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9551         pagedep = dirrem->dm_pagedep;
9552         /*
9553          * The possible values for isrmdir:
9554          *      0 - non-directory file rename
9555          *      1 - directory rename within same directory
9556          *   inum - directory rename to new directory of given inode number
9557          * When renaming to a new directory, we are both deleting and
9558          * creating a new directory entry, so the link count on the new
9559          * directory should not change. Thus we do not need the followup
9560          * dirrem which is usually done in handle_workitem_remove. We set
9561          * the DIRCHG flag to tell handle_workitem_remove to skip the
9562          * followup dirrem.
9563          */
9564         if (isrmdir > 1)
9565                 dirrem->dm_state |= DIRCHG;
9566
9567         /*
9568          * Whiteouts have no additional dependencies,
9569          * so just put the dirrem on the correct list.
9570          */
9571         if (newinum == UFS_WINO) {
9572                 if ((dirrem->dm_state & COMPLETE) == 0) {
9573                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9574                             dm_next);
9575                 } else {
9576                         dirrem->dm_dirinum = pagedep->pd_ino;
9577                         if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9578                                 add_to_worklist(&dirrem->dm_list, 0);
9579                 }
9580                 FREE_LOCK(ump);
9581                 return;
9582         }
9583         /*
9584          * Add the dirrem to the inodedep's pending remove list for quick
9585          * discovery later.  A valid nlinkdelta ensures that this lookup
9586          * will not fail.
9587          */
9588         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9589                 panic("softdep_setup_directory_change: Lost inodedep.");
9590         dirrem->dm_state |= ONDEPLIST;
9591         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9592
9593         /*
9594          * If the COMPLETE flag is clear, then there were no active
9595          * entries and we want to roll back to the previous inode until
9596          * the new inode is committed to disk. If the COMPLETE flag is
9597          * set, then we have deleted an entry that never made it to disk.
9598          * If the entry we deleted resulted from a name change, then the old
9599          * inode reference still resides on disk. Any rollback that we do
9600          * needs to be to that old inode (returned to us in prevdirrem). If
9601          * the entry we deleted resulted from a create, then there is
9602          * no entry on the disk, so we want to roll back to zero rather
9603          * than the uncommitted inode. In either of the COMPLETE cases we
9604          * want to immediately free the unwritten and unreferenced inode.
9605          */
9606         if ((dirrem->dm_state & COMPLETE) == 0) {
9607                 dap->da_previous = dirrem;
9608         } else {
9609                 if (prevdirrem != NULL) {
9610                         dap->da_previous = prevdirrem;
9611                 } else {
9612                         dap->da_state &= ~DIRCHG;
9613                         dap->da_pagedep = pagedep;
9614                 }
9615                 dirrem->dm_dirinum = pagedep->pd_ino;
9616                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9617                         add_to_worklist(&dirrem->dm_list, 0);
9618         }
9619         /*
9620          * Lookup the jaddref for this journal entry.  We must finish
9621          * initializing it and make the diradd write dependent on it.
9622          * If we're not journaling, put it on the id_bufwait list if the
9623          * inode is not yet written. If it is written, do the post-inode
9624          * write processing to put it on the id_pendinghd list.
9625          */
9626         inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
9627         if (MOUNTEDSUJ(mp)) {
9628                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9629                     inoreflst);
9630                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9631                     ("softdep_setup_directory_change: bad jaddref %p",
9632                     jaddref));
9633                 jaddref->ja_diroff = I_OFFSET(dp);
9634                 jaddref->ja_diradd = dap;
9635                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9636                     dap, da_pdlist);
9637                 add_to_journal(&jaddref->ja_list);
9638         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9639                 dap->da_state |= COMPLETE;
9640                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9641                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9642         } else {
9643                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9644                     dap, da_pdlist);
9645                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9646         }
9647         /*
9648          * If we're making a new name for a directory that has not been
9649          * committed when need to move the dot and dotdot references to
9650          * this new name.
9651          */
9652         if (inodedep->id_mkdiradd && I_OFFSET(dp) != DOTDOT_OFFSET)
9653                 merge_diradd(inodedep, dap);
9654         FREE_LOCK(ump);
9655 }
9656
9657 /*
9658  * Called whenever the link count on an inode is changed.
9659  * It creates an inode dependency so that the new reference(s)
9660  * to the inode cannot be committed to disk until the updated
9661  * inode has been written.
9662  */
9663 void
9664 softdep_change_linkcnt(
9665         struct inode *ip)       /* the inode with the increased link count */
9666 {
9667         struct inodedep *inodedep;
9668         struct ufsmount *ump;
9669
9670         ump = ITOUMP(ip);
9671         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9672             ("softdep_change_linkcnt called on non-softdep filesystem"));
9673         ACQUIRE_LOCK(ump);
9674         inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
9675         if (ip->i_nlink < ip->i_effnlink)
9676                 panic("softdep_change_linkcnt: bad delta");
9677         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9678         FREE_LOCK(ump);
9679 }
9680
9681 /*
9682  * Attach a sbdep dependency to the superblock buf so that we can keep
9683  * track of the head of the linked list of referenced but unlinked inodes.
9684  */
9685 void
9686 softdep_setup_sbupdate(
9687         struct ufsmount *ump,
9688         struct fs *fs,
9689         struct buf *bp)
9690 {
9691         struct sbdep *sbdep;
9692         struct worklist *wk;
9693
9694         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9695             ("softdep_setup_sbupdate called on non-softdep filesystem"));
9696         LIST_FOREACH(wk, &bp->b_dep, wk_list)
9697                 if (wk->wk_type == D_SBDEP)
9698                         break;
9699         if (wk != NULL)
9700                 return;
9701         sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9702         workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9703         sbdep->sb_fs = fs;
9704         sbdep->sb_ump = ump;
9705         ACQUIRE_LOCK(ump);
9706         WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9707         FREE_LOCK(ump);
9708 }
9709
9710 /*
9711  * Return the first unlinked inodedep which is ready to be the head of the
9712  * list.  The inodedep and all those after it must have valid next pointers.
9713  */
9714 static struct inodedep *
9715 first_unlinked_inodedep(struct ufsmount *ump)
9716 {
9717         struct inodedep *inodedep;
9718         struct inodedep *idp;
9719
9720         LOCK_OWNED(ump);
9721         for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9722             inodedep; inodedep = idp) {
9723                 if ((inodedep->id_state & UNLINKNEXT) == 0)
9724                         return (NULL);
9725                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9726                 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9727                         break;
9728                 if ((inodedep->id_state & UNLINKPREV) == 0)
9729                         break;
9730         }
9731         return (inodedep);
9732 }
9733
9734 /*
9735  * Set the sujfree unlinked head pointer prior to writing a superblock.
9736  */
9737 static void
9738 initiate_write_sbdep(struct sbdep *sbdep)
9739 {
9740         struct inodedep *inodedep;
9741         struct fs *bpfs;
9742         struct fs *fs;
9743
9744         bpfs = sbdep->sb_fs;
9745         fs = sbdep->sb_ump->um_fs;
9746         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9747         if (inodedep) {
9748                 fs->fs_sujfree = inodedep->id_ino;
9749                 inodedep->id_state |= UNLINKPREV;
9750         } else
9751                 fs->fs_sujfree = 0;
9752         bpfs->fs_sujfree = fs->fs_sujfree;
9753         /*
9754          * Because we have made changes to the superblock, we need to
9755          * recompute its check-hash.
9756          */
9757         bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
9758 }
9759
9760 /*
9761  * After a superblock is written determine whether it must be written again
9762  * due to a changing unlinked list head.
9763  */
9764 static int
9765 handle_written_sbdep(struct sbdep *sbdep, struct buf *bp)
9766 {
9767         struct inodedep *inodedep;
9768         struct fs *fs;
9769
9770         LOCK_OWNED(sbdep->sb_ump);
9771         fs = sbdep->sb_fs;
9772         /*
9773          * If the superblock doesn't match the in-memory list start over.
9774          */
9775         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9776         if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9777             (inodedep == NULL && fs->fs_sujfree != 0)) {
9778                 bdirty(bp);
9779                 return (1);
9780         }
9781         WORKITEM_FREE(sbdep, D_SBDEP);
9782         if (fs->fs_sujfree == 0)
9783                 return (0);
9784         /*
9785          * Now that we have a record of this inode in stable store allow it
9786          * to be written to free up pending work.  Inodes may see a lot of
9787          * write activity after they are unlinked which we must not hold up.
9788          */
9789         for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9790                 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9791                         panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9792                             inodedep, inodedep->id_state);
9793                 if (inodedep->id_state & UNLINKONLIST)
9794                         break;
9795                 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9796         }
9797
9798         return (0);
9799 }
9800
9801 /*
9802  * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9803  */
9804 static void
9805 unlinked_inodedep( struct mount *mp, struct inodedep *inodedep)
9806 {
9807         struct ufsmount *ump;
9808
9809         ump = VFSTOUFS(mp);
9810         LOCK_OWNED(ump);
9811         if (MOUNTEDSUJ(mp) == 0)
9812                 return;
9813         ump->um_fs->fs_fmod = 1;
9814         if (inodedep->id_state & UNLINKED)
9815                 panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9816         inodedep->id_state |= UNLINKED;
9817         TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9818 }
9819
9820 /*
9821  * Remove an inodedep from the unlinked inodedep list.  This may require
9822  * disk writes if the inode has made it that far.
9823  */
9824 static void
9825 clear_unlinked_inodedep( struct inodedep *inodedep)
9826 {
9827         struct ufs2_dinode *dip;
9828         struct ufsmount *ump;
9829         struct inodedep *idp;
9830         struct inodedep *idn;
9831         struct fs *fs, *bpfs;
9832         struct buf *bp;
9833         daddr_t dbn;
9834         ino_t ino;
9835         ino_t nino;
9836         ino_t pino;
9837         int error;
9838
9839         ump = VFSTOUFS(inodedep->id_list.wk_mp);
9840         fs = ump->um_fs;
9841         ino = inodedep->id_ino;
9842         error = 0;
9843         for (;;) {
9844                 LOCK_OWNED(ump);
9845                 KASSERT((inodedep->id_state & UNLINKED) != 0,
9846                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
9847                     inodedep));
9848                 /*
9849                  * If nothing has yet been written simply remove us from
9850                  * the in memory list and return.  This is the most common
9851                  * case where handle_workitem_remove() loses the final
9852                  * reference.
9853                  */
9854                 if ((inodedep->id_state & UNLINKLINKS) == 0)
9855                         break;
9856                 /*
9857                  * If we have a NEXT pointer and no PREV pointer we can simply
9858                  * clear NEXT's PREV and remove ourselves from the list.  Be
9859                  * careful not to clear PREV if the superblock points at
9860                  * next as well.
9861                  */
9862                 idn = TAILQ_NEXT(inodedep, id_unlinked);
9863                 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9864                         if (idn && fs->fs_sujfree != idn->id_ino)
9865                                 idn->id_state &= ~UNLINKPREV;
9866                         break;
9867                 }
9868                 /*
9869                  * Here we have an inodedep which is actually linked into
9870                  * the list.  We must remove it by forcing a write to the
9871                  * link before us, whether it be the superblock or an inode.
9872                  * Unfortunately the list may change while we're waiting
9873                  * on the buf lock for either resource so we must loop until
9874                  * we lock the right one.  If both the superblock and an
9875                  * inode point to this inode we must clear the inode first
9876                  * followed by the superblock.
9877                  */
9878                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9879                 pino = 0;
9880                 if (idp && (idp->id_state & UNLINKNEXT))
9881                         pino = idp->id_ino;
9882                 FREE_LOCK(ump);
9883                 if (pino == 0) {
9884                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9885                             (int)fs->fs_sbsize, 0, 0, 0);
9886                 } else {
9887                         dbn = fsbtodb(fs, ino_to_fsba(fs, pino));
9888                         error = ffs_breadz(ump, ump->um_devvp, dbn, dbn,
9889                             (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL,
9890                             &bp);
9891                 }
9892                 ACQUIRE_LOCK(ump);
9893                 if (error)
9894                         break;
9895                 /* If the list has changed restart the loop. */
9896                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9897                 nino = 0;
9898                 if (idp && (idp->id_state & UNLINKNEXT))
9899                         nino = idp->id_ino;
9900                 if (nino != pino ||
9901                     (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9902                         FREE_LOCK(ump);
9903                         brelse(bp);
9904                         ACQUIRE_LOCK(ump);
9905                         continue;
9906                 }
9907                 nino = 0;
9908                 idn = TAILQ_NEXT(inodedep, id_unlinked);
9909                 if (idn)
9910                         nino = idn->id_ino;
9911                 /*
9912                  * Remove us from the in memory list.  After this we cannot
9913                  * access the inodedep.
9914                  */
9915                 KASSERT((inodedep->id_state & UNLINKED) != 0,
9916                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
9917                     inodedep));
9918                 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9919                 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9920                 FREE_LOCK(ump);
9921                 /*
9922                  * The predecessor's next pointer is manually updated here
9923                  * so that the NEXT flag is never cleared for an element
9924                  * that is in the list.
9925                  */
9926                 if (pino == 0) {
9927                         bcopy((caddr_t)fs, bp->b_data, (uint64_t)fs->fs_sbsize);
9928                         bpfs = (struct fs *)bp->b_data;
9929                         ffs_oldfscompat_write(bpfs, ump);
9930                         softdep_setup_sbupdate(ump, bpfs, bp);
9931                         /*
9932                          * Because we may have made changes to the superblock,
9933                          * we need to recompute its check-hash.
9934                          */
9935                         bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
9936                 } else if (fs->fs_magic == FS_UFS1_MAGIC) {
9937                         ((struct ufs1_dinode *)bp->b_data +
9938                             ino_to_fsbo(fs, pino))->di_freelink = nino;
9939                 } else {
9940                         dip = (struct ufs2_dinode *)bp->b_data +
9941                             ino_to_fsbo(fs, pino);
9942                         dip->di_freelink = nino;
9943                         ffs_update_dinode_ckhash(fs, dip);
9944                 }
9945                 /*
9946                  * If the bwrite fails we have no recourse to recover.  The
9947                  * filesystem is corrupted already.
9948                  */
9949                 bwrite(bp);
9950                 ACQUIRE_LOCK(ump);
9951                 /*
9952                  * If the superblock pointer still needs to be cleared force
9953                  * a write here.
9954                  */
9955                 if (fs->fs_sujfree == ino) {
9956                         FREE_LOCK(ump);
9957                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9958                             (int)fs->fs_sbsize, 0, 0, 0);
9959                         bcopy((caddr_t)fs, bp->b_data, (uint64_t)fs->fs_sbsize);
9960                         bpfs = (struct fs *)bp->b_data;
9961                         ffs_oldfscompat_write(bpfs, ump);
9962                         softdep_setup_sbupdate(ump, bpfs, bp);
9963                         /*
9964                          * Because we may have made changes to the superblock,
9965                          * we need to recompute its check-hash.
9966                          */
9967                         bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
9968                         bwrite(bp);
9969                         ACQUIRE_LOCK(ump);
9970                 }
9971
9972                 if (fs->fs_sujfree != ino)
9973                         return;
9974                 panic("clear_unlinked_inodedep: Failed to clear free head");
9975         }
9976         if (inodedep->id_ino == fs->fs_sujfree)
9977                 panic("clear_unlinked_inodedep: Freeing head of free list");
9978         inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9979         TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9980         return;
9981 }
9982
9983 /*
9984  * This workitem decrements the inode's link count.
9985  * If the link count reaches zero, the file is removed.
      *
      * The vnode for dirrem->dm_oldinum is fetched with LK_EXCLUSIVE or'ed into
      * the caller's flags.  A plain file loses one link.  A directory (RMDIR)
      * loses two, and unless this is a directory change (DIRCHG) the dirrem is
      * then retargeted at the parent directory (dm_dirinum) so that the parent
      * loses the link for "..", either through an immediate recursive call or
      * later from the removed inode's id_inowait list.
      *
      * Panics if the dirrem is still on a worklist, if an expected inodedep
      * is missing, or if the new link count drops below i_effnlink.
      *
      * Returns EBUSY if ffs_vgetf() fails for any reason; otherwise 0, or the
      * result of the recursive call made for the parent directory.
9986  */
9987 static int
9988 handle_workitem_remove(struct dirrem *dirrem, int flags)
9989 {
9990         struct inodedep *inodedep;
9991         struct workhead dotdotwk;
9992         struct worklist *wk;
9993         struct ufsmount *ump;
9994         struct mount *mp;
9995         struct vnode *vp;
9996         struct inode *ip;
9997         ino_t oldinum;
9998
9999         if (dirrem->dm_state & ONWORKLIST)
10000                 panic("handle_workitem_remove: dirrem %p still on worklist",
10001                     dirrem);
10002         oldinum = dirrem->dm_oldinum;
10003         mp = dirrem->dm_list.wk_mp;
10004         ump = VFSTOUFS(mp);
10005         flags |= LK_EXCLUSIVE;
              /* Every ffs_vgetf() failure is reported to the caller as EBUSY. */
10006         if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ |
10007             FFSV_FORCEINODEDEP) != 0)
10008                 return (EBUSY);
10009         ip = VTOI(vp);
10010         MPASS(ip->i_mode != 0);
10011         ACQUIRE_LOCK(ump);
10012         if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
10013                 panic("handle_workitem_remove: lost inodedep");
10014         if (dirrem->dm_state & ONDEPLIST)
10015                 LIST_REMOVE(dirrem, dm_inonext);
10016         KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
10017             ("handle_workitem_remove:  Journal entries not written."));
10018
10019         /*
10020          * Move all dependencies waiting on the remove to complete
10021          * from the dirrem to the inode inowait list to be completed
10022          * after the inode has been updated and written to disk.
10023          *
10024          * Any marked MKDIR_PARENT are saved to be completed when the
10025          * dotdot ref is removed unless DIRCHG is specified.  For
10026          * directory change operations there will be no further
10027          * directory writes and the jsegdeps need to be moved along
10028          * with the rest to be completed when the inode is free or
10029          * stable in the inode free list.
10030          */
10031         LIST_INIT(&dotdotwk);
10032         while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
10033                 WORKLIST_REMOVE(wk);
10034                 if ((dirrem->dm_state & DIRCHG) == 0 &&
10035                     wk->wk_state & MKDIR_PARENT) {
10036                         wk->wk_state &= ~MKDIR_PARENT;
10037                         WORKLIST_INSERT(&dotdotwk, wk);
10038                         continue;
10039                 }
10040                 WORKLIST_INSERT(&inodedep->id_inowait, wk);
10041         }
              /*
               * dm_jwork was drained by the loop above; swap the saved
               * MKDIR_PARENT items (if any) back onto it so they stay with
               * the dirrem.
               */
10042         LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
10043         /*
10044          * Normal file deletion.
10045          */
10046         if ((dirrem->dm_state & RMDIR) == 0) {
10047                 ip->i_nlink--;
10048                 KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: file ino "
10049                     "%ju negative i_nlink %d", (intmax_t)ip->i_number,
10050                     ip->i_nlink));
10051                 DIP_SET_NLINK(ip, ip->i_nlink);
10052                 UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10053                 if (ip->i_nlink < ip->i_effnlink)
10054                         panic("handle_workitem_remove: bad file delta");
10055                 if (ip->i_nlink == 0)
10056                         unlinked_inodedep(mp, inodedep);
10057                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
10058                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
10059                     ("handle_workitem_remove: worklist not empty. %s",
10060                     TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
10061                 WORKITEM_FREE(dirrem, D_DIRREM);
10062                 FREE_LOCK(ump);
10063                 goto out;
10064         }
10065         /*
10066          * Directory deletion. Decrement reference count for both the
10067          * just deleted parent directory entry and the reference for ".".
10068          * Arrange to have the reference count on the parent decremented
10069          * to account for the loss of "..".
10070          */
10071         ip->i_nlink -= 2;
10072         KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: directory ino "
10073             "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink));
10074         DIP_SET_NLINK(ip, ip->i_nlink);
10075         UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10076         if (ip->i_nlink < ip->i_effnlink)
10077                 panic("handle_workitem_remove: bad dir delta");
10078         if (ip->i_nlink == 0)
10079                 unlinked_inodedep(mp, inodedep);
10080         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
10081         /*
10082          * Rename a directory to a new parent. Since we are both deleting
10083          * and creating a new directory entry, the link count on the new
10084          * directory should not change. Thus we skip the followup dirrem.
10085          */
10086         if (dirrem->dm_state & DIRCHG) {
10087                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
10088                     ("handle_workitem_remove: DIRCHG and worklist not empty."));
10089                 WORKITEM_FREE(dirrem, D_DIRREM);
10090                 FREE_LOCK(ump);
10091                 goto out;
10092         }
              /*
               * Reuse the dirrem for the parent directory.  Assigning dm_state
               * (rather than or'ing) drops RMDIR, so when this dirrem is
               * handled again it takes the single-link path above, now aimed
               * at dm_dirinum.
               */
10093         dirrem->dm_state = ONDEPLIST;
10094         dirrem->dm_oldinum = dirrem->dm_dirinum;
10095         /*
10096          * Place the dirrem on the parent's id_dirremhd list.
10097          */
10098         if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
10099                 panic("handle_workitem_remove: lost dir inodedep");
10100         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
10101         /*
10102          * If the allocated inode has never been written to disk, then
10103          * the on-disk inode is zero'ed and we can remove the file
10104          * immediately.  When journaling if the inode has been marked
10105          * unlinked and not DEPCOMPLETE we know it can never be written.
10106          */
              /* Switch back to the removed directory's own inodedep (oldinum). */
10107         inodedep_lookup(mp, oldinum, 0, &inodedep);
10108         if (inodedep == NULL ||
10109             (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
10110             check_inode_unwritten(inodedep)) {
                      /*
                       * Drop the lock and this vnode, then process the
                       * retargeted dirrem for the parent right away.  Note
                       * that ffs_update() is not called on this path.
                       */
10111                 FREE_LOCK(ump);
10112                 vput(vp);
10113                 return handle_workitem_remove(dirrem, flags);
10114         }
              /* Otherwise defer the parent update via the inode's id_inowait list. */
10115         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
10116         FREE_LOCK(ump);
10117         UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10118 out:
              /* Common exit: update the inode via ffs_update(), release the vnode. */
10119         ffs_update(vp, 0);
10120         vput(vp);
10121         return (0);
10122 }
10123
10124 /*
10125  * Inode de-allocation dependencies.
10126  *
10127  * When an inode's link count is reduced to zero, it can be de-allocated. We
10128  * found it convenient to postpone de-allocation until after the inode is
10129  * written to disk with its new link count (zero).  At this point, all of the
10130  * on-disk inode's block pointers are nullified and, with careful dependency
10131  * list ordering, all dependencies related to the inode will be satisfied and
10132  * the corresponding dependency structures de-allocated.  So, if/when the
10133  * inode is reused, there will be no mixing of old dependencies with new
10134  * ones.  This artificial dependency is set up by the block de-allocation
10135  * procedure above (softdep_setup_freeblocks) and completed by the
10136  * following procedure.
       *
       * handle_workitem_freefile() decrements fs_pendinginodes, passes the
       * freefile's fx_jwork list to ffs_freefile() together with fx_devvp,
       * fx_oldinum and fx_mode, reports a failure through softdep_error(), and
       * finally frees the freefile work item.  With INVARIANTS it first panics
       * if an inodedep for fx_oldinum still exists.
10137  */
10138 static void
10139 handle_workitem_freefile(struct freefile *freefile)
10140 {
10141         struct workhead wkhd;
10142         struct fs *fs;
10143         struct ufsmount *ump;
10144         int error;
10145 #ifdef INVARIANTS
10146         struct inodedep *idp;
10147 #endif
10148
10149         ump = VFSTOUFS(freefile->fx_list.wk_mp);
10150         fs = ump->um_fs;
10151 #ifdef INVARIANTS
              /* A nonzero lookup result means an inodedep was found. */
10152         ACQUIRE_LOCK(ump);
10153         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
10154         FREE_LOCK(ump);
10155         if (error)
10156                 panic("handle_workitem_freefile: inodedep %p survived", idp);
10157 #endif
              /* fs_pendinginodes is adjusted under UFS_LOCK, not the softdep lock. */
10158         UFS_LOCK(ump);
10159         fs->fs_pendinginodes -= 1;
10160         UFS_UNLOCK(ump);
              /* Detach fx_jwork onto a local list head before handing it off. */
10161         LIST_INIT(&wkhd);
10162         LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
10163         if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
10164             freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
10165                 softdep_error("handle_workitem_freefile", error);
              /* The work item is freed even if ffs_freefile() reported an error. */
10166         ACQUIRE_LOCK(ump);
10167         WORKITEM_FREE(freefile, D_FREEFILE);
10168         FREE_LOCK(ump);
10169 }
10170
10171 /*
10172  * Helper function which unlinks marker element from work list and returns
10173  * the next element on the list.
       *
       * The successor is read before the marker is removed.  The result is the
       * loop variable of the walk in softdep_disk_io_initiation(), which stops
       * when it is NULL.
10174  */
10175 static __inline struct worklist *
10176 markernext(struct worklist *marker)
10177 {
10178         struct worklist *next;
10179
10180         next = LIST_NEXT(marker, wk_list);
10181         LIST_REMOVE(marker, wk_list);
10182         return next;
10183 }
10184
10185 /*
10186  * Disk writes.
10187  *
10188  * The dependency structures constructed above are most actively used when file
10189  * system blocks are written to disk.  No constraints are placed on when a
10190  * block can be written, but unsatisfied update dependencies are made safe by
10191  * modifying (or replacing) the source memory for the duration of the disk
10192  * write.  When the disk write completes, the memory block is again brought
10193  * up-to-date.
10194  *
10195  * In-core inode structure reclamation.
10196  *
10197  * Because there are a finite number of "in-core" inode structures, they are
10198  * reused regularly.  By transferring all inode-related dependencies to the
10199  * in-memory inode block and indexing them separately (via "inodedep"s), we
10200  * can allow "in-core" inode structures to be reused at any time and avoid
10201  * any increase in contention.
10202  *
10203  * Called just before entering the device driver to initiate a new disk I/O.
10204  * The buffer must be locked, thus, no I/O completion operations can occur
10205  * while we are manipulating its associated dependencies.
       *
       * Each work item on bp->b_dep is dispatched on its type.  A marker that
       * lives on this function's stack is linked in after the item being
       * processed, and the walk resumes from the marker (markernext()) rather
       * than from the item, which may no longer be on the list by then (see
       * the D_FREEBLKS case).
       *
       * Panics if bp is not a write, if a background write of bp is in
       * progress, or if an unexpected work item type is found.  Returns without
       * doing anything if softdep_bp_to_mp() returns NULL for the buffer.
10206  */
10207 static void
10208 softdep_disk_io_initiation(
10209         struct buf *bp)         /* structure describing disk write to occur */
10210 {
10211         struct worklist *wk;
10212         struct worklist marker;
10213         struct inodedep *inodedep;
10214         struct freeblks *freeblks;
10215         struct jblkdep *jblkdep;
10216         struct newblk *newblk;
10217         struct ufsmount *ump;
10218
10219         /*
10220          * We only care about write operations. There should never
10221          * be dependencies for reads.
10222          */
10223         if (bp->b_iocmd != BIO_WRITE)
10224                 panic("softdep_disk_io_initiation: not write");
10225
10226         if (bp->b_vflags & BV_BKGRDINPROG)
10227                 panic("softdep_disk_io_initiation: Writing buffer with "
10228                     "background write in progress: %p", bp);
10229
10230         ump = softdep_bp_to_mp(bp);
10231         if (ump == NULL)
10232                 return;
10233
10234         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
10235         PHOLD(curproc);                 /* Don't swap out kernel stack */
10236         ACQUIRE_LOCK(ump);
10237         /*
10238          * Do any necessary pre-I/O processing.
10239          */
10240         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
10241              wk = markernext(&marker)) {
                      /* Default position: resume with the item after wk. */
10242                 LIST_INSERT_AFTER(wk, &marker, wk_list);
10243                 switch (wk->wk_type) {
10244                 case D_PAGEDEP:
10245                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
10246                         continue;
10247
10248                 case D_INODEDEP:
10249                         inodedep = WK_INODEDEP(wk);
10250                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
10251                                 initiate_write_inodeblock_ufs1(inodedep, bp);
10252                         else
10253                                 initiate_write_inodeblock_ufs2(inodedep, bp);
10254                         continue;
10255
10256                 case D_INDIRDEP:
10257                         initiate_write_indirdep(WK_INDIRDEP(wk), bp);
10258                         continue;
10259
10260                 case D_BMSAFEMAP:
10261                         initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
10262                         continue;
10263
10264                 case D_JSEG:
10265                         WK_JSEG(wk)->js_buf = NULL;
10266                         continue;
10267
10268                 case D_FREEBLKS:
10269                         freeblks = WK_FREEBLKS(wk);
10270                         jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
10271                         /*
10272                          * We have to wait for the freeblks to be journaled
10273                          * before we can write an inodeblock with updated
10274                          * pointers.  Be careful to arrange the marker so
10275                          * we revisit the freeblks if it's not removed by
10276                          * the first jwait().
10277                          */
10278                         if (jblkdep != NULL) {
10279                                 LIST_REMOVE(&marker, wk_list);
10280                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
10281                                 jwait(&jblkdep->jb_list, MNT_WAIT);
10282                         }
10283                         continue;
10284                 case D_ALLOCDIRECT:
10285                 case D_ALLOCINDIR:
10286                         /*
10287                          * We have to wait for the jnewblk to be journaled
10288                          * before we can write to a block if the contents
10289                          * may be confused with an earlier file's indirect
10290                          * at recovery time.  Handle the marker as described
10291                          * above.
10292                          */
10293                         newblk = WK_NEWBLK(wk);
10294                         if (newblk->nb_jnewblk != NULL &&
10295                             indirblk_lookup(newblk->nb_list.wk_mp,
10296                             newblk->nb_newblkno)) {
10297                                 LIST_REMOVE(&marker, wk_list);
10298                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
10299                                 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10300                         }
10301                         continue;
10302
10303                 case D_SBDEP:
10304                         initiate_write_sbdep(WK_SBDEP(wk));
10305                         continue;
10306
                      /* These types need no processing before the write. */
10307                 case D_MKDIR:
10308                 case D_FREEWORK:
10309                 case D_FREEDEP:
10310                 case D_JSEGDEP:
10311                         continue;
10312
10313                 default:
                              /* Use this function's name, as the panics above do. */
10314                         panic("softdep_disk_io_initiation: Unexpected type %s",
10315                             TYPENAME(wk->wk_type));
10316                         /* NOTREACHED */
10317                 }
10318         }
10319         FREE_LOCK(ump);
10320         PRELE(curproc);                 /* Allow swapout of kernel stack */
10321 }
10322
10323 /*
10324  * Called from within the procedure above to deal with unsatisfied
10325  * allocation dependencies in a directory. The buffer must be locked,
10326  * thus, no I/O completion operations can occur while we are
10327  * manipulating its associated dependencies.
       *
       * Marks the pagedep IOSTARTED, waits for the journal records of pending
       * removes (jremref) and moves (jmvref), then rolls back every pending
       * directory addition recorded on the pagedep in bp's data: the entry's
       * inode number is replaced by the previous one for a DIRCHG, or by 0
       * otherwise, and the diradd goes from ATTACHED to UNDONE.
       *
       * Panics if a directory entry's inode number does not match the diradd.
10328  */
10329 static void
10330 initiate_write_filepage(struct pagedep *pagedep, struct buf *bp)
10331 {
10332         struct jremref *jremref;
10333         struct jmvref *jmvref;
10334         struct dirrem *dirrem;
10335         struct diradd *dap;
10336         struct direct *ep;
10337         int i;
10338
10339         if (pagedep->pd_state & IOSTARTED) {
10340                 /*
10341                  * This can only happen if there is a driver that does not
10342                  * understand chaining. Here biodone will reissue the call
10343                  * to strategy for the incomplete buffers.
10344                  */
10345                 printf("initiate_write_filepage: already started\n");
10346                 return;
10347         }
10348         pagedep->pd_state |= IOSTARTED;
10349         /*
10350          * Wait for all journal remove dependencies to hit the disk.
10351          * We can not allow any potentially conflicting directory adds
10352          * to be visible before removes and rollback is too difficult.
10353          * The per-filesystem lock may be dropped and re-acquired, however
10354          * we hold the buf locked so the dependency can not go away.
10355          */
10356         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10357                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10358                         jwait(&jremref->jr_list, MNT_WAIT);
10359         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10360                 jwait(&jmvref->jm_list, MNT_WAIT);
              /* Walk every diradd hash chain and undo each addition in the buffer. */
10361         for (i = 0; i < DAHASHSZ; i++) {
10362                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
                              /* da_offset is the entry's byte offset within bp's data. */
10363                         ep = (struct direct *)
10364                             ((char *)bp->b_data + dap->da_offset);
10365                         if (ep->d_ino != dap->da_newinum)
10366                                 panic("%s: dir inum %ju != new %ju",
10367                                     "initiate_write_filepage",
10368                                     (uintmax_t)ep->d_ino,
10369                                     (uintmax_t)dap->da_newinum);
                              /* A changed entry reverts to its old inode; a new one to 0. */
10370                         if (dap->da_state & DIRCHG)
10371                                 ep->d_ino = dap->da_previous->dm_oldinum;
10372                         else
10373                                 ep->d_ino = 0;
10374                         dap->da_state &= ~ATTACHED;
10375                         dap->da_state |= UNDONE;
10376                 }
10377         }
10378 }
10379
10380 /*
10381  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10382  * Note that any bug fixes made to this routine must be done in the
10383  * version found below.
10384  *
10385  * Called from within the procedure above to deal with unsatisfied
10386  * allocation dependencies in an inodeblock. The buffer must be
10387  * locked, thus, no I/O completion operations can occur while we
10388  * are manipulating its associated dependencies.
       *
       * In order, this routine:
       *  - marks the inodedep IOSTARTED (panics if it already was);
       *  - fills in di_freelink for an unlinked inode whose next pointer has
       *    not been written yet;
       *  - if the inodedep is not DEPCOMPLETE, saves the whole dinode in
       *    id_savedino1 and leaves a zeroed dinode in the buffer that keeps
       *    only di_gen and di_freelink, then returns;
       *  - otherwise records the current size and link count in the inodedep
       *    and rolls back the link count, block pointers and size for every
       *    dependency still pending on id_inoreflst and id_inoupdt.
       *
       * Must be called with the softdep lock held (LOCK_OWNED); the lock is
       * dropped and re-acquired around the malloc() in the not-DEPCOMPLETE
       * case.
10389  */
10390 static void
10391 initiate_write_inodeblock_ufs1(
10392         struct inodedep *inodedep,
10393         struct buf *bp)                 /* The inode block */
10394 {
10395         struct allocdirect *adp, *lastadp;
10396         struct ufs1_dinode *dp;
10397         struct ufs1_dinode *sip;
10398         struct inoref *inoref;
10399         struct ufsmount *ump;
10400         struct fs *fs;
10401         ufs_lbn_t i;
10402 #ifdef INVARIANTS
10403         ufs_lbn_t prevlbn = 0;
10404 #endif
              /* Bitmask of the ad_offset values seen; only checked under INVARIANTS. */
10405         int deplist __diagused;
10406
10407         if (inodedep->id_state & IOSTARTED)
10408                 panic("initiate_write_inodeblock_ufs1: already started");
10409         inodedep->id_state |= IOSTARTED;
10410         fs = inodedep->id_fs;
10411         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10412         LOCK_OWNED(ump);
              /* Locate this inode's dinode within the inode block buffer. */
10413         dp = (struct ufs1_dinode *)bp->b_data +
10414             ino_to_fsbo(fs, inodedep->id_ino);
10415
10416         /*
10417          * If we're on the unlinked list but have not yet written our
10418          * next pointer initialize it here.
10419          */
10420         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10421                 struct inodedep *inon;
10422
10423                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10424                 dp->di_freelink = inon ? inon->id_ino : 0;
10425         }
10426         /*
10427          * If the bitmap is not yet written, then the allocated
10428          * inode cannot be written to disk.
10429          */
10430         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10431                 if (inodedep->id_savedino1 != NULL)
10432                         panic("initiate_write_inodeblock_ufs1: I/O underway");
                      /* The softdep lock is not held across the allocation. */
10433                 FREE_LOCK(ump);
10434                 sip = malloc(sizeof(struct ufs1_dinode),
10435                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10436                 ACQUIRE_LOCK(ump);
                      /*
                       * Keep a full copy of the dinode and leave a zeroed one
                       * in the buffer, preserving only di_gen and di_freelink.
                       */
10437                 inodedep->id_savedino1 = sip;
10438                 *inodedep->id_savedino1 = *dp;
10439                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10440                 dp->di_gen = inodedep->id_savedino1->di_gen;
10441                 dp->di_freelink = inodedep->id_savedino1->di_freelink;
10442                 return;
10443         }
10444         /*
10445          * If no dependencies, then there is nothing to roll back.
10446          */
              /* The current values are recorded before anything is rolled back. */
10447         inodedep->id_savedsize = dp->di_size;
10448         inodedep->id_savedextsize = 0;
10449         inodedep->id_savednlink = dp->di_nlink;
10450         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10451             TAILQ_EMPTY(&inodedep->id_inoreflst))
10452                 return;
10453         /*
10454          * Revert the link count to that of the first unwritten journal entry.
10455          */
10456         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10457         if (inoref)
10458                 dp->di_nlink = inoref->if_nlink;
10459         /*
10460          * Set the dependencies to busy.
10461          */
10462         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10463              adp = TAILQ_NEXT(adp, ad_next)) {
10464 #ifdef INVARIANTS
                      /*
                       * NOTE(review): the panic text below is prefixed
                       * "softdep_write_inodeblock", unlike the other panics
                       * in this function; it looks like a stale name.
                       * Any change must also be made in the UFS2 version.
                       */
10465                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10466                         panic("softdep_write_inodeblock: lbn order");
10467                 prevlbn = adp->ad_offset;
10468                 if (adp->ad_offset < UFS_NDADDR &&
10469                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10470                         panic("initiate_write_inodeblock_ufs1: "
10471                             "direct pointer #%jd mismatch %d != %jd",
10472                             (intmax_t)adp->ad_offset,
10473                             dp->di_db[adp->ad_offset],
10474                             (intmax_t)adp->ad_newblkno);
10475                 if (adp->ad_offset >= UFS_NDADDR &&
10476                     dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10477                         panic("initiate_write_inodeblock_ufs1: "
10478                             "indirect pointer #%jd mismatch %d != %jd",
10479                             (intmax_t)adp->ad_offset - UFS_NDADDR,
10480                             dp->di_ib[adp->ad_offset - UFS_NDADDR],
10481                             (intmax_t)adp->ad_newblkno);
10482                 deplist |= 1 << adp->ad_offset;
10483                 if ((adp->ad_state & ATTACHED) == 0)
10484                         panic("initiate_write_inodeblock_ufs1: "
10485                             "Unknown state 0x%x", adp->ad_state);
10486 #endif /* INVARIANTS */
10487                 adp->ad_state &= ~ATTACHED;
10488                 adp->ad_state |= UNDONE;
10489         }
10490         /*
10491          * The on-disk inode cannot claim to be any larger than the last
10492          * fragment that has been written. Otherwise, the on-disk inode
10493          * might have fragments that were not the last block in the file
10494          * which would corrupt the filesystem.
10495          */
10496         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10497              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
                      /* Offsets at or beyond UFS_NDADDR index di_ib; handled below. */
10498                 if (adp->ad_offset >= UFS_NDADDR)
10499                         break;
10500                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10501                 /* keep going until hitting a rollback to a frag */
10502                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10503                         continue;
                      /*
                       * Rolled back to a fragment: the file ends inside this
                       * block, so clear every later direct pointer and all
                       * indirect pointers, then stop.
                       */
10504                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10505                 for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10506 #ifdef INVARIANTS
10507                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10508                                 panic("initiate_write_inodeblock_ufs1: "
10509                                     "lost dep1");
10510 #endif /* INVARIANTS */
10511                         dp->di_db[i] = 0;
10512                 }
10513                 for (i = 0; i < UFS_NIADDR; i++) {
10514 #ifdef INVARIANTS
10515                         if (dp->di_ib[i] != 0 &&
10516                             (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10517                                 panic("initiate_write_inodeblock_ufs1: "
10518                                     "lost dep2");
10519 #endif /* INVARIANTS */
10520                         dp->di_ib[i] = 0;
10521                 }
10522                 return;
10523         }
10524         /*
10525          * If we have zero'ed out the last allocated block of the file,
10526          * roll back the size to the last currently allocated block.
10527          * We know that this last allocated block is full-sized as
10528          * we already checked for fragments in the loop above.
10529          */
10530         if (lastadp != NULL &&
10531             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
                      /* Scan down for the highest direct pointer still set. */
10532                 for (i = lastadp->ad_offset; i >= 0; i--)
10533                         if (dp->di_db[i] != 0)
10534                                 break;
10535                 dp->di_size = (i + 1) * fs->fs_bsize;
10536         }
10537         /*
10538          * The only dependencies are for indirect blocks.
10539          *
10540          * The file size for indirect block additions is not guaranteed.
10541          * Such a guarantee would be non-trivial to achieve. The conventional
10542          * synchronous write implementation also does not make this guarantee.
10543          * Fsck should catch and fix discrepancies. Arguably, the file size
10544          * can be over-estimated without destroying integrity when the file
10545          * moves into the indirect blocks (i.e., is large). If we want to
10546          * postpone fsck, we are stuck with this argument.
10547          */
              /* adp is where the loop above stopped: the first indirect entry, if any. */
10548         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10549                 dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10550 }
10551
10552 /*
10553  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10554  * Note that any bug fixes made to this routine must be done in the
10555  * version found above.
10556  *
10557  * Called from within the procedure above to deal with unsatisfied
10558  * allocation dependencies in an inodeblock. The buffer must be
10559  * locked, thus, no I/O completion operations can occur while we
10560  * are manipulating its associated dependencies.
10561  */
10562 static void
10563 initiate_write_inodeblock_ufs2(
10564         struct inodedep *inodedep,
10565         struct buf *bp)                 /* The inode block */
10566 {
10567         struct allocdirect *adp, *lastadp;
10568         struct ufs2_dinode *dp;
10569         struct ufs2_dinode *sip;
10570         struct inoref *inoref;
10571         struct ufsmount *ump;
10572         struct fs *fs;
10573         ufs_lbn_t i;
10574 #ifdef INVARIANTS
10575         ufs_lbn_t prevlbn = 0;
10576 #endif
10577         int deplist __diagused;
10578
10579         if (inodedep->id_state & IOSTARTED)
10580                 panic("initiate_write_inodeblock_ufs2: already started");
10581         inodedep->id_state |= IOSTARTED;
10582         fs = inodedep->id_fs;
10583         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10584         LOCK_OWNED(ump);
10585         dp = (struct ufs2_dinode *)bp->b_data +
10586             ino_to_fsbo(fs, inodedep->id_ino);
10587
10588         /*
10589          * If we're on the unlinked list but have not yet written our
10590          * next pointer initialize it here.
10591          */
10592         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10593                 struct inodedep *inon;
10594
10595                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10596                 dp->di_freelink = inon ? inon->id_ino : 0;
10597                 ffs_update_dinode_ckhash(fs, dp);
10598         }
10599         /*
10600          * If the bitmap is not yet written, then the allocated
10601          * inode cannot be written to disk.
10602          */
10603         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10604                 if (inodedep->id_savedino2 != NULL)
10605                         panic("initiate_write_inodeblock_ufs2: I/O underway");
10606                 FREE_LOCK(ump);
10607                 sip = malloc(sizeof(struct ufs2_dinode),
10608                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10609                 ACQUIRE_LOCK(ump);
10610                 inodedep->id_savedino2 = sip;
10611                 *inodedep->id_savedino2 = *dp;
10612                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10613                 dp->di_gen = inodedep->id_savedino2->di_gen;
10614                 dp->di_freelink = inodedep->id_savedino2->di_freelink;
10615                 return;
10616         }
10617         /*
10618          * If no dependencies, then there is nothing to roll back.
10619          */
10620         inodedep->id_savedsize = dp->di_size;
10621         inodedep->id_savedextsize = dp->di_extsize;
10622         inodedep->id_savednlink = dp->di_nlink;
10623         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10624             TAILQ_EMPTY(&inodedep->id_extupdt) &&
10625             TAILQ_EMPTY(&inodedep->id_inoreflst))
10626                 return;
10627         /*
10628          * Revert the link count to that of the first unwritten journal entry.
10629          */
10630         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10631         if (inoref)
10632                 dp->di_nlink = inoref->if_nlink;
10633
10634         /*
10635          * Set the ext data dependencies to busy.
10636          */
10637         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10638              adp = TAILQ_NEXT(adp, ad_next)) {
10639 #ifdef INVARIANTS
10640                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10641                         panic("initiate_write_inodeblock_ufs2: lbn order");
10642                 prevlbn = adp->ad_offset;
10643                 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10644                         panic("initiate_write_inodeblock_ufs2: "
10645                             "ext pointer #%jd mismatch %jd != %jd",
10646                             (intmax_t)adp->ad_offset,
10647                             (intmax_t)dp->di_extb[adp->ad_offset],
10648                             (intmax_t)adp->ad_newblkno);
10649                 deplist |= 1 << adp->ad_offset;
10650                 if ((adp->ad_state & ATTACHED) == 0)
10651                         panic("initiate_write_inodeblock_ufs2: Unknown "
10652                             "state 0x%x", adp->ad_state);
10653 #endif /* INVARIANTS */
10654                 adp->ad_state &= ~ATTACHED;
10655                 adp->ad_state |= UNDONE;
10656         }
10657         /*
10658          * The on-disk inode cannot claim to be any larger than the last
10659          * fragment that has been written. Otherwise, the on-disk inode
10660          * might have fragments that were not the last block in the ext
10661          * data which would corrupt the filesystem.
10662          */
10663         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10664              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10665                 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10666                 /* keep going until hitting a rollback to a frag */
10667                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10668                         continue;
10669                 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10670                 for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) {
10671 #ifdef INVARIANTS
10672                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10673                                 panic("initiate_write_inodeblock_ufs2: "
10674                                     "lost dep1");
10675 #endif /* INVARIANTS */
10676                         dp->di_extb[i] = 0;
10677                 }
10678                 lastadp = NULL;
10679                 break;
10680         }
10681         /*
10682          * If we have zero'ed out the last allocated block of the ext
10683          * data, roll back the size to the last currently allocated block.
10684          * We know that this last allocated block is a full-sized as
10685          * we already checked for fragments in the loop above.
10686          */
10687         if (lastadp != NULL &&
10688             dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10689                 for (i = lastadp->ad_offset; i >= 0; i--)
10690                         if (dp->di_extb[i] != 0)
10691                                 break;
10692                 dp->di_extsize = (i + 1) * fs->fs_bsize;
10693         }
10694         /*
10695          * Set the file data dependencies to busy.
10696          */
10697         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10698              adp = TAILQ_NEXT(adp, ad_next)) {
10699 #ifdef INVARIANTS
10700                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10701                         panic("softdep_write_inodeblock: lbn order");
10702                 if ((adp->ad_state & ATTACHED) == 0)
10703                         panic("inodedep %p and adp %p not attached", inodedep, adp);
10704                 prevlbn = adp->ad_offset;
10705                 if (!ffs_fsfail_cleanup(ump, 0) &&
10706                     adp->ad_offset < UFS_NDADDR &&
10707                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10708                         panic("initiate_write_inodeblock_ufs2: "
10709                             "direct pointer #%jd mismatch %jd != %jd",
10710                             (intmax_t)adp->ad_offset,
10711                             (intmax_t)dp->di_db[adp->ad_offset],
10712                             (intmax_t)adp->ad_newblkno);
10713                 if (!ffs_fsfail_cleanup(ump, 0) &&
10714                     adp->ad_offset >= UFS_NDADDR &&
10715                     dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10716                         panic("initiate_write_inodeblock_ufs2: "
10717                             "indirect pointer #%jd mismatch %jd != %jd",
10718                             (intmax_t)adp->ad_offset - UFS_NDADDR,
10719                             (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR],
10720                             (intmax_t)adp->ad_newblkno);
10721                 deplist |= 1 << adp->ad_offset;
10722                 if ((adp->ad_state & ATTACHED) == 0)
10723                         panic("initiate_write_inodeblock_ufs2: Unknown "
10724                              "state 0x%x", adp->ad_state);
10725 #endif /* INVARIANTS */
10726                 adp->ad_state &= ~ATTACHED;
10727                 adp->ad_state |= UNDONE;
10728         }
10729         /*
10730          * The on-disk inode cannot claim to be any larger than the last
10731          * fragment that has been written. Otherwise, the on-disk inode
10732          * might have fragments that were not the last block in the file
10733          * which would corrupt the filesystem.
10734          */
10735         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10736              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10737                 if (adp->ad_offset >= UFS_NDADDR)
10738                         break;
10739                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10740                 /* keep going until hitting a rollback to a frag */
10741                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10742                         continue;
10743                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10744                 for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10745 #ifdef INVARIANTS
10746                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10747                                 panic("initiate_write_inodeblock_ufs2: "
10748                                     "lost dep2");
10749 #endif /* INVARIANTS */
10750                         dp->di_db[i] = 0;
10751                 }
10752                 for (i = 0; i < UFS_NIADDR; i++) {
10753 #ifdef INVARIANTS
10754                         if (dp->di_ib[i] != 0 &&
10755                             (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10756                                 panic("initiate_write_inodeblock_ufs2: "
10757                                     "lost dep3");
10758 #endif /* INVARIANTS */
10759                         dp->di_ib[i] = 0;
10760                 }
10761                 ffs_update_dinode_ckhash(fs, dp);
10762                 return;
10763         }
10764         /*
10765          * If we have zero'ed out the last allocated block of the file,
10766          * roll back the size to the last currently allocated block.
10767          * We know that this last allocated block is a full-sized as
10768          * we already checked for fragments in the loop above.
10769          */
10770         if (lastadp != NULL &&
10771             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10772                 for (i = lastadp->ad_offset; i >= 0; i--)
10773                         if (dp->di_db[i] != 0)
10774                                 break;
10775                 dp->di_size = (i + 1) * fs->fs_bsize;
10776         }
10777         /*
10778          * The only dependencies are for indirect blocks.
10779          *
10780          * The file size for indirect block additions is not guaranteed.
10781          * Such a guarantee would be non-trivial to achieve. The conventional
10782          * synchronous write implementation also does not make this guarantee.
10783          * Fsck should catch and fix discrepancies. Arguably, the file size
10784          * can be over-estimated without destroying integrity when the file
10785          * moves into the indirect blocks (i.e., is large). If we want to
10786          * postpone fsck, we are stuck with this argument.
10787          */
10788         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10789                 dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10790         ffs_update_dinode_ckhash(fs, dp);
10791 }
10792
10793 /*
10794  * Cancel an indirdep as a result of truncation.  Release all of the
10795  * children allocindirs and place their journal work on the appropriate
10796  * list.
10797  */
10798 static void
10799 cancel_indirdep(
10800         struct indirdep *indirdep,
10801         struct buf *bp,
10802         struct freeblks *freeblks)
10803 {
10804         struct allocindir *aip;
10805
10806         /*
10807          * None of the indirect pointers will ever be visible,
10808          * so they can simply be tossed. GOINGAWAY ensures
10809          * that allocated pointers will be saved in the buffer
10810          * cache until they are freed. Note that they will
10811          * only be able to be found by their physical address
10812          * since the inode mapping the logical address will
10813          * be gone. The save buffer used for the safe copy
10814          * was allocated in setup_allocindir_phase2 using
10815          * the physical address so it could be used for this
10816          * purpose. Hence we swap the safe copy with the real
10817          * copy, allowing the safe copy to be freed and holding
10818          * on to the real copy for later use in indir_trunc.
10819          */
10820         if (indirdep->ir_state & GOINGAWAY)
10821                 panic("cancel_indirdep: already gone");
10822         if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10823                 indirdep->ir_state |= DEPCOMPLETE;
10824                 LIST_REMOVE(indirdep, ir_next);
10825         }
10826         indirdep->ir_state |= GOINGAWAY;
10827         /*
10828          * Pass in bp for blocks still have journal writes
10829          * pending so we can cancel them on their own.
10830          */
10831         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
10832                 cancel_allocindir(aip, bp, freeblks, 0);
10833         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
10834                 cancel_allocindir(aip, NULL, freeblks, 0);
10835         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
10836                 cancel_allocindir(aip, NULL, freeblks, 0);
10837         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
10838                 cancel_allocindir(aip, NULL, freeblks, 0);
10839         /*
10840          * If there are pending partial truncations we need to keep the
10841          * old block copy around until they complete.  This is because
10842          * the current b_data is not a perfect superset of the available
10843          * blocks.
10844          */
10845         if (TAILQ_EMPTY(&indirdep->ir_trunc))
10846                 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10847         else
10848                 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10849         WORKLIST_REMOVE(&indirdep->ir_list);
10850         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10851         indirdep->ir_bp = NULL;
10852         indirdep->ir_freeblks = freeblks;
10853 }
10854
10855 /*
10856  * Free an indirdep once it no longer has new pointers to track.
10857  */
10858 static void
10859 free_indirdep(struct indirdep *indirdep)
10860 {
10861
10862         KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10863             ("free_indirdep: Indir trunc list not empty."));
10864         KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10865             ("free_indirdep: Complete head not empty."));
10866         KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10867             ("free_indirdep: write head not empty."));
10868         KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10869             ("free_indirdep: done head not empty."));
10870         KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10871             ("free_indirdep: deplist head not empty."));
10872         KASSERT((indirdep->ir_state & DEPCOMPLETE),
10873             ("free_indirdep: %p still on newblk list.", indirdep));
10874         KASSERT(indirdep->ir_saveddata == NULL,
10875             ("free_indirdep: %p still has saved data.", indirdep));
10876         KASSERT(indirdep->ir_savebp == NULL,
10877             ("free_indirdep: %p still has savebp buffer.", indirdep));
10878         if (indirdep->ir_state & ONWORKLIST)
10879                 WORKLIST_REMOVE(&indirdep->ir_list);
10880         WORKITEM_FREE(indirdep, D_INDIRDEP);
10881 }
10882
10883 /*
10884  * Called before a write to an indirdep.  This routine is responsible for
10885  * rolling back pointers to a safe state which includes only those
10886  * allocindirs which have been completed.
10887  */
10888 static void
10889 initiate_write_indirdep(struct indirdep *indirdep, struct buf *bp)
10890 {
10891         struct ufsmount *ump;
10892
10893         indirdep->ir_state |= IOSTARTED;
10894         if (indirdep->ir_state & GOINGAWAY)
10895                 panic("disk_io_initiation: indirdep gone");
10896         /*
10897          * If there are no remaining dependencies, this will be writing
10898          * the real pointers.
10899          */
10900         if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10901             TAILQ_EMPTY(&indirdep->ir_trunc))
10902                 return;
10903         /*
10904          * Replace up-to-date version with safe version.
10905          */
10906         if (indirdep->ir_saveddata == NULL) {
10907                 ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10908                 LOCK_OWNED(ump);
10909                 FREE_LOCK(ump);
10910                 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10911                     M_SOFTDEP_FLAGS);
10912                 ACQUIRE_LOCK(ump);
10913         }
10914         indirdep->ir_state &= ~ATTACHED;
10915         indirdep->ir_state |= UNDONE;
10916         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10917         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10918             bp->b_bcount);
10919 }
10920
10921 /*
10922  * Called when an inode has been cleared in a cg bitmap.  This finally
10923  * eliminates any canceled jaddrefs
10924  */
10925 void
10926 softdep_setup_inofree(struct mount *mp,
10927         struct buf *bp,
10928         ino_t ino,
10929         struct workhead *wkhd,
10930         bool doingrecovery)
10931 {
10932         struct worklist *wk, *wkn;
10933         struct ufsmount *ump;
10934 #ifdef INVARIANTS
10935         struct inodedep *inodedep;
10936 #endif
10937
10938         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10939             ("softdep_setup_inofree called on non-softdep filesystem"));
10940         ump = VFSTOUFS(mp);
10941         ACQUIRE_LOCK(ump);
10942         KASSERT(doingrecovery || ffs_fsfail_cleanup(ump, 0) ||
10943             isclr(cg_inosused((struct cg *)bp->b_data),
10944             ino % ump->um_fs->fs_ipg),
10945             ("softdep_setup_inofree: inode %ju not freed.", (uintmax_t)ino));
10946         KASSERT(inodedep_lookup(mp, ino, 0, &inodedep) == 0,
10947             ("softdep_setup_inofree: ino %ju has existing inodedep %p",
10948             (uintmax_t)ino, inodedep));
10949         if (wkhd) {
10950                 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10951                         if (wk->wk_type != D_JADDREF)
10952                                 continue;
10953                         WORKLIST_REMOVE(wk);
10954                         /*
10955                          * We can free immediately even if the jaddref
10956                          * isn't attached in a background write as now
10957                          * the bitmaps are reconciled.
10958                          */
10959                         wk->wk_state |= COMPLETE | ATTACHED;
10960                         free_jaddref(WK_JADDREF(wk));
10961                 }
10962                 jwork_move(&bp->b_dep, wkhd);
10963         }
10964         FREE_LOCK(ump);
10965 }
10966
10967 /*
10968  * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10969  * map.  Any dependencies waiting for the write to clear are added to the
10970  * buf's list and any jnewblks that are being canceled are discarded
10971  * immediately.
10972  */
10973 void
10974 softdep_setup_blkfree(
10975         struct mount *mp,
10976         struct buf *bp,
10977         ufs2_daddr_t blkno,
10978         int frags,
10979         struct workhead *wkhd,
10980         bool doingrecovery)
10981 {
10982         struct bmsafemap *bmsafemap;
10983         struct jnewblk *jnewblk;
10984         struct ufsmount *ump;
10985         struct worklist *wk;
10986         struct fs *fs;
10987 #ifdef INVARIANTS
10988         uint8_t *blksfree;
10989         struct cg *cgp;
10990         ufs2_daddr_t jstart;
10991         ufs2_daddr_t jend;
10992         ufs2_daddr_t end;
10993         long bno;
10994         int i;
10995 #endif
10996
10997         CTR3(KTR_SUJ,
10998             "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10999             blkno, frags, wkhd);
11000
11001         ump = VFSTOUFS(mp);
11002         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
11003             ("softdep_setup_blkfree called on non-softdep filesystem"));
11004         ACQUIRE_LOCK(ump);
11005         /* Lookup the bmsafemap so we track when it is dirty. */
11006         fs = ump->um_fs;
11007         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
11008         /*
11009          * Detach any jnewblks which have been canceled.  They must linger
11010          * until the bitmap is cleared again by ffs_blkfree() to prevent
11011          * an unjournaled allocation from hitting the disk.
11012          */
11013         if (wkhd) {
11014                 while ((wk = LIST_FIRST(wkhd)) != NULL) {
11015                         CTR2(KTR_SUJ,
11016                             "softdep_setup_blkfree: blkno %jd wk type %d",
11017                             blkno, wk->wk_type);
11018                         WORKLIST_REMOVE(wk);
11019                         if (wk->wk_type != D_JNEWBLK) {
11020                                 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
11021                                 continue;
11022                         }
11023                         jnewblk = WK_JNEWBLK(wk);
11024                         KASSERT(jnewblk->jn_state & GOINGAWAY,
11025                             ("softdep_setup_blkfree: jnewblk not canceled."));
11026 #ifdef INVARIANTS
11027                         if (!doingrecovery && !ffs_fsfail_cleanup(ump, 0)) {
11028                                 /*
11029                                  * Assert that this block is free in the
11030                                  * bitmap before we discard the jnewblk.
11031                                  */
11032                                 cgp = (struct cg *)bp->b_data;
11033                                 blksfree = cg_blksfree(cgp);
11034                                 bno = dtogd(fs, jnewblk->jn_blkno);
11035                                 for (i = jnewblk->jn_oldfrags;
11036                                     i < jnewblk->jn_frags; i++) {
11037                                         if (isset(blksfree, bno + i))
11038                                                 continue;
11039                                         panic("softdep_setup_blkfree: block "
11040                                             "%ju not freed.",
11041                                             (uintmax_t)jnewblk->jn_blkno);
11042                                 }
11043                         }
11044 #endif
11045                         /*
11046                          * Even if it's not attached we can free immediately
11047                          * as the new bitmap is correct.
11048                          */
11049                         wk->wk_state |= COMPLETE | ATTACHED;
11050                         free_jnewblk(jnewblk);
11051                 }
11052         }
11053
11054 #ifdef INVARIANTS
11055         /*
11056          * Assert that we are not freeing a block which has an outstanding
11057          * allocation dependency.
11058          */
11059         fs = VFSTOUFS(mp)->um_fs;
11060         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
11061         end = blkno + frags;
11062         LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
11063                 /*
11064                  * Don't match against blocks that will be freed when the
11065                  * background write is done.
11066                  */
11067                 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
11068                     (COMPLETE | DEPCOMPLETE))
11069                         continue;
11070                 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
11071                 jend = jnewblk->jn_blkno + jnewblk->jn_frags;
11072                 if ((blkno >= jstart && blkno < jend) ||
11073                     (end > jstart && end <= jend)) {
11074                         printf("state 0x%X %jd - %d %d dep %p\n",
11075                             jnewblk->jn_state, jnewblk->jn_blkno,
11076                             jnewblk->jn_oldfrags, jnewblk->jn_frags,
11077                             jnewblk->jn_dep);
11078                         panic("softdep_setup_blkfree: "
11079                             "%jd-%jd(%d) overlaps with %jd-%jd",
11080                             blkno, end, frags, jstart, jend);
11081                 }
11082         }
11083 #endif
11084         FREE_LOCK(ump);
11085 }
11086
11087 /*
11088  * Revert a block allocation when the journal record that describes it
11089  * is not yet written.
11090  */
11091 static int
11092 jnewblk_rollback(
11093         struct jnewblk *jnewblk,
11094         struct fs *fs,
11095         struct cg *cgp,
11096         uint8_t *blksfree)
11097 {
11098         ufs1_daddr_t fragno;
11099         long cgbno, bbase;
11100         int frags, blk;
11101         int i;
11102
11103         frags = 0;
11104         cgbno = dtogd(fs, jnewblk->jn_blkno);
11105         /*
11106          * We have to test which frags need to be rolled back.  We may
11107          * be operating on a stale copy when doing background writes.
11108          */
11109         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
11110                 if (isclr(blksfree, cgbno + i))
11111                         frags++;
11112         if (frags == 0)
11113                 return (0);
11114         /*
11115          * This is mostly ffs_blkfree() sans some validation and
11116          * superblock updates.
11117          */
11118         if (frags == fs->fs_frag) {
11119                 fragno = fragstoblks(fs, cgbno);
11120                 ffs_setblock(fs, blksfree, fragno);
11121                 ffs_clusteracct(fs, cgp, fragno, 1);
11122                 cgp->cg_cs.cs_nbfree++;
11123         } else {
11124                 cgbno += jnewblk->jn_oldfrags;
11125                 bbase = cgbno - fragnum(fs, cgbno);
11126                 /* Decrement the old frags.  */
11127                 blk = blkmap(fs, blksfree, bbase);
11128                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11129                 /* Deallocate the fragment */
11130                 for (i = 0; i < frags; i++)
11131                         setbit(blksfree, cgbno + i);
11132                 cgp->cg_cs.cs_nffree += frags;
11133                 /* Add back in counts associated with the new frags */
11134                 blk = blkmap(fs, blksfree, bbase);
11135                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11136                 /* If a complete block has been reassembled, account for it. */
11137                 fragno = fragstoblks(fs, bbase);
11138                 if (ffs_isblock(fs, blksfree, fragno)) {
11139                         cgp->cg_cs.cs_nffree -= fs->fs_frag;
11140                         ffs_clusteracct(fs, cgp, fragno, 1);
11141                         cgp->cg_cs.cs_nbfree++;
11142                 }
11143         }
11144         stat_jnewblk++;
11145         jnewblk->jn_state &= ~ATTACHED;
11146         jnewblk->jn_state |= UNDONE;
11147
11148         return (frags);
11149 }
11150
11151 static void
11152 initiate_write_bmsafemap(
11153         struct bmsafemap *bmsafemap,
11154         struct buf *bp)                 /* The cg block. */
11155 {
11156         struct jaddref *jaddref;
11157         struct jnewblk *jnewblk;
11158         uint8_t *inosused;
11159         uint8_t *blksfree;
11160         struct cg *cgp;
11161         struct fs *fs;
11162         ino_t ino;
11163
11164         /*
11165          * If this is a background write, we did this at the time that
11166          * the copy was made, so do not need to do it again.
11167          */
11168         if (bmsafemap->sm_state & IOSTARTED)
11169                 return;
11170         bmsafemap->sm_state |= IOSTARTED;
11171         /*
11172          * Clear any inode allocations which are pending journal writes.
11173          */
11174         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
11175                 cgp = (struct cg *)bp->b_data;
11176                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11177                 inosused = cg_inosused(cgp);
11178                 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
11179                         ino = jaddref->ja_ino % fs->fs_ipg;
11180                         if (isset(inosused, ino)) {
11181                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
11182                                         cgp->cg_cs.cs_ndir--;
11183                                 cgp->cg_cs.cs_nifree++;
11184                                 clrbit(inosused, ino);
11185                                 jaddref->ja_state &= ~ATTACHED;
11186                                 jaddref->ja_state |= UNDONE;
11187                                 stat_jaddref++;
11188                         } else
11189                                 panic("initiate_write_bmsafemap: inode %ju "
11190                                     "marked free", (uintmax_t)jaddref->ja_ino);
11191                 }
11192         }
11193         /*
11194          * Clear any block allocations which are pending journal writes.
11195          */
11196         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11197                 cgp = (struct cg *)bp->b_data;
11198                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11199                 blksfree = cg_blksfree(cgp);
11200                 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
11201                         if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
11202                                 continue;
11203                         panic("initiate_write_bmsafemap: block %jd "
11204                             "marked free", jnewblk->jn_blkno);
11205                 }
11206         }
11207         /*
11208          * Move allocation lists to the written lists so they can be
11209          * cleared once the block write is complete.
11210          */
11211         LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
11212             inodedep, id_deps);
11213         LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
11214             newblk, nb_deps);
11215         LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
11216             wk_list);
11217 }
11218
11219 void
11220 softdep_handle_error(struct buf *bp)
11221 {
11222         struct ufsmount *ump;
11223
11224         ump = softdep_bp_to_mp(bp);
11225         if (ump == NULL)
11226                 return;
11227
11228         if (ffs_fsfail_cleanup(ump, bp->b_error)) {
11229                 /*
11230                  * No future writes will succeed, so the on-disk image is safe.
11231                  * Pretend that this write succeeded so that the softdep state
11232                  * will be cleaned up naturally.
11233                  */
11234                 bp->b_ioflags &= ~BIO_ERROR;
11235                 bp->b_error = 0;
11236         }
11237 }
11238
11239 /*
11240  * This routine is called during the completion interrupt
11241  * service routine for a disk write (from the procedure called
11242  * by the device driver to inform the filesystem caches of
11243  * a request completion).  It should be called early in this
11244  * procedure, before the block is made available to other
11245  * processes or other routines are called.
11246  *
11247  */
11248 static void
11249 softdep_disk_write_complete(
11250         struct buf *bp)         /* describes the completed disk write */
11251 {
11252         struct worklist *wk;
11253         struct worklist *owk;
11254         struct ufsmount *ump;
11255         struct workhead reattach;
11256         struct freeblks *freeblks;
11257         struct buf *sbp;
11258
11259         ump = softdep_bp_to_mp(bp);
11260         KASSERT(LIST_EMPTY(&bp->b_dep) || ump != NULL,
11261             ("softdep_disk_write_complete: softdep_bp_to_mp returned NULL "
11262              "with outstanding dependencies for buffer %p", bp));
11263         if (ump == NULL)
11264                 return;
11265         if ((bp->b_ioflags & BIO_ERROR) != 0)
11266                 softdep_handle_error(bp);
11267         /*
11268          * If an error occurred while doing the write, then the data
11269          * has not hit the disk and the dependencies cannot be processed.
11270          * But we do have to go through and roll forward any dependencies
11271          * that were rolled back before the disk write.
11272          */
11273         sbp = NULL;
11274         ACQUIRE_LOCK(ump);
11275         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
11276                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11277                         switch (wk->wk_type) {
11278                         case D_PAGEDEP:
11279                                 handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
11280                                 continue;
11281
11282                         case D_INODEDEP:
11283                                 handle_written_inodeblock(WK_INODEDEP(wk),
11284                                     bp, 0);
11285                                 continue;
11286
11287                         case D_BMSAFEMAP:
11288                                 handle_written_bmsafemap(WK_BMSAFEMAP(wk),
11289                                     bp, 0);
11290                                 continue;
11291
11292                         case D_INDIRDEP:
11293                                 handle_written_indirdep(WK_INDIRDEP(wk),
11294                                     bp, &sbp, 0);
11295                                 continue;
11296                         default:
11297                                 /* nothing to roll forward */
11298                                 continue;
11299                         }
11300                 }
11301                 FREE_LOCK(ump);
11302                 if (sbp)
11303                         brelse(sbp);
11304                 return;
11305         }
11306         LIST_INIT(&reattach);
11307
11308         /*
11309          * Ump SU lock must not be released anywhere in this code segment.
11310          */
11311         owk = NULL;
11312         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
11313                 WORKLIST_REMOVE(wk);
11314                 atomic_add_long(&dep_write[wk->wk_type], 1);
11315                 if (wk == owk)
11316                         panic("duplicate worklist: %p\n", wk);
11317                 owk = wk;
11318                 switch (wk->wk_type) {
11319                 case D_PAGEDEP:
11320                         if (handle_written_filepage(WK_PAGEDEP(wk), bp,
11321                             WRITESUCCEEDED))
11322                                 WORKLIST_INSERT(&reattach, wk);
11323                         continue;
11324
11325                 case D_INODEDEP:
11326                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
11327                             WRITESUCCEEDED))
11328                                 WORKLIST_INSERT(&reattach, wk);
11329                         continue;
11330
11331                 case D_BMSAFEMAP:
11332                         if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
11333                             WRITESUCCEEDED))
11334                                 WORKLIST_INSERT(&reattach, wk);
11335                         continue;
11336
11337                 case D_MKDIR:
11338                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
11339                         continue;
11340
11341                 case D_ALLOCDIRECT:
11342                         wk->wk_state |= COMPLETE;
11343                         handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
11344                         continue;
11345
11346                 case D_ALLOCINDIR:
11347                         wk->wk_state |= COMPLETE;
11348                         handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11349                         continue;
11350
11351                 case D_INDIRDEP:
11352                         if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
11353                             WRITESUCCEEDED))
11354                                 WORKLIST_INSERT(&reattach, wk);
11355                         continue;
11356
11357                 case D_FREEBLKS:
11358                         wk->wk_state |= COMPLETE;
11359                         freeblks = WK_FREEBLKS(wk);
11360                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11361                             LIST_EMPTY(&freeblks->fb_jblkdephd))
11362                                 add_to_worklist(wk, WK_NODELAY);
11363                         continue;
11364
11365                 case D_FREEWORK:
11366                         handle_written_freework(WK_FREEWORK(wk));
11367                         break;
11368
11369                 case D_JSEGDEP:
11370                         free_jsegdep(WK_JSEGDEP(wk));
11371                         continue;
11372
11373                 case D_JSEG:
11374                         handle_written_jseg(WK_JSEG(wk), bp);
11375                         continue;
11376
11377                 case D_SBDEP:
11378                         if (handle_written_sbdep(WK_SBDEP(wk), bp))
11379                                 WORKLIST_INSERT(&reattach, wk);
11380                         continue;
11381
11382                 case D_FREEDEP:
11383                         free_freedep(WK_FREEDEP(wk));
11384                         continue;
11385
11386                 default:
11387                         panic("handle_disk_write_complete: Unknown type %s",
11388                             TYPENAME(wk->wk_type));
11389                         /* NOTREACHED */
11390                 }
11391         }
11392         /*
11393          * Reattach any requests that must be redone.
11394          */
11395         while ((wk = LIST_FIRST(&reattach)) != NULL) {
11396                 WORKLIST_REMOVE(wk);
11397                 WORKLIST_INSERT(&bp->b_dep, wk);
11398         }
11399         FREE_LOCK(ump);
11400         if (sbp)
11401                 brelse(sbp);
11402 }
11403
11404 /*
11405  * Called from within softdep_disk_write_complete above.
11406  */
11407 static void
11408 handle_allocdirect_partdone(
11409         struct allocdirect *adp,        /* the completed allocdirect */
11410         struct workhead *wkhd)          /* Work to do when inode is writtne. */
11411 {
11412         struct allocdirectlst *listhead;
11413         struct allocdirect *listadp;
11414         struct inodedep *inodedep;
11415         long bsize;
11416
11417         LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp));
11418         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11419                 return;
11420         /*
11421          * The on-disk inode cannot claim to be any larger than the last
11422          * fragment that has been written. Otherwise, the on-disk inode
11423          * might have fragments that were not the last block in the file
11424          * which would corrupt the filesystem. Thus, we cannot free any
11425          * allocdirects after one whose ad_oldblkno claims a fragment as
11426          * these blocks must be rolled back to zero before writing the inode.
11427          * We check the currently active set of allocdirects in id_inoupdt
11428          * or id_extupdt as appropriate.
11429          */
11430         inodedep = adp->ad_inodedep;
11431         bsize = inodedep->id_fs->fs_bsize;
11432         if (adp->ad_state & EXTDATA)
11433                 listhead = &inodedep->id_extupdt;
11434         else
11435                 listhead = &inodedep->id_inoupdt;
11436         TAILQ_FOREACH(listadp, listhead, ad_next) {
11437                 /* found our block */
11438                 if (listadp == adp)
11439                         break;
11440                 /* continue if ad_oldlbn is not a fragment */
11441                 if (listadp->ad_oldsize == 0 ||
11442                     listadp->ad_oldsize == bsize)
11443                         continue;
11444                 /* hit a fragment */
11445                 return;
11446         }
11447         /*
11448          * If we have reached the end of the current list without
11449          * finding the just finished dependency, then it must be
11450          * on the future dependency list. Future dependencies cannot
11451          * be freed until they are moved to the current list.
11452          */
11453         if (listadp == NULL) {
11454 #ifdef INVARIANTS
11455                 if (adp->ad_state & EXTDATA)
11456                         listhead = &inodedep->id_newextupdt;
11457                 else
11458                         listhead = &inodedep->id_newinoupdt;
11459                 TAILQ_FOREACH(listadp, listhead, ad_next)
11460                         /* found our block */
11461                         if (listadp == adp)
11462                                 break;
11463                 if (listadp == NULL)
11464                         panic("handle_allocdirect_partdone: lost dep");
11465 #endif /* INVARIANTS */
11466                 return;
11467         }
11468         /*
11469          * If we have found the just finished dependency, then queue
11470          * it along with anything that follows it that is complete.
11471          * Since the pointer has not yet been written in the inode
11472          * as the dependency prevents it, place the allocdirect on the
11473          * bufwait list where it will be freed once the pointer is
11474          * valid.
11475          */
11476         if (wkhd == NULL)
11477                 wkhd = &inodedep->id_bufwait;
11478         for (; adp; adp = listadp) {
11479                 listadp = TAILQ_NEXT(adp, ad_next);
11480                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11481                         return;
11482                 TAILQ_REMOVE(listhead, adp, ad_next);
11483                 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11484         }
11485 }
11486
11487 /*
11488  * Called from within softdep_disk_write_complete above.  This routine
11489  * completes successfully written allocindirs.
11490  */
11491 static void
11492 handle_allocindir_partdone(
11493         struct allocindir *aip)         /* the completed allocindir */
11494 {
11495         struct indirdep *indirdep;
11496
11497         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11498                 return;
11499         indirdep = aip->ai_indirdep;
11500         LIST_REMOVE(aip, ai_next);
11501         /*
11502          * Don't set a pointer while the buffer is undergoing IO or while
11503          * we have active truncations.
11504          */
11505         if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11506                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11507                 return;
11508         }
11509         if (indirdep->ir_state & UFS1FMT)
11510                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11511                     aip->ai_newblkno;
11512         else
11513                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11514                     aip->ai_newblkno;
11515         /*
11516          * Await the pointer write before freeing the allocindir.
11517          */
11518         LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11519 }
11520
11521 /*
11522  * Release segments held on a jwork list.
11523  */
11524 static void
11525 handle_jwork(struct workhead *wkhd)
11526 {
11527         struct worklist *wk;
11528
11529         while ((wk = LIST_FIRST(wkhd)) != NULL) {
11530                 WORKLIST_REMOVE(wk);
11531                 switch (wk->wk_type) {
11532                 case D_JSEGDEP:
11533                         free_jsegdep(WK_JSEGDEP(wk));
11534                         continue;
11535                 case D_FREEDEP:
11536                         free_freedep(WK_FREEDEP(wk));
11537                         continue;
11538                 case D_FREEFRAG:
11539                         rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11540                         WORKITEM_FREE(wk, D_FREEFRAG);
11541                         continue;
11542                 case D_FREEWORK:
11543                         handle_written_freework(WK_FREEWORK(wk));
11544                         continue;
11545                 default:
11546                         panic("handle_jwork: Unknown type %s\n",
11547                             TYPENAME(wk->wk_type));
11548                 }
11549         }
11550 }
11551
11552 /*
11553  * Handle the bufwait list on an inode when it is safe to release items
11554  * held there.  This normally happens after an inode block is written but
11555  * may be delayed and handled later if there are pending journal items that
11556  * are not yet safe to be released.
11557  */
11558 static struct freefile *
11559 handle_bufwait(
11560         struct inodedep *inodedep,
11561         struct workhead *refhd)
11562 {
11563         struct jaddref *jaddref;
11564         struct freefile *freefile;
11565         struct worklist *wk;
11566
11567         freefile = NULL;
11568         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11569                 WORKLIST_REMOVE(wk);
11570                 switch (wk->wk_type) {
11571                 case D_FREEFILE:
11572                         /*
11573                          * We defer adding freefile to the worklist
11574                          * until all other additions have been made to
11575                          * ensure that it will be done after all the
11576                          * old blocks have been freed.
11577                          */
11578                         if (freefile != NULL)
11579                                 panic("handle_bufwait: freefile");
11580                         freefile = WK_FREEFILE(wk);
11581                         continue;
11582
11583                 case D_MKDIR:
11584                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11585                         continue;
11586
11587                 case D_DIRADD:
11588                         diradd_inode_written(WK_DIRADD(wk), inodedep);
11589                         continue;
11590
11591                 case D_FREEFRAG:
11592                         wk->wk_state |= COMPLETE;
11593                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11594                                 add_to_worklist(wk, 0);
11595                         continue;
11596
11597                 case D_DIRREM:
11598                         wk->wk_state |= COMPLETE;
11599                         add_to_worklist(wk, 0);
11600                         continue;
11601
11602                 case D_ALLOCDIRECT:
11603                 case D_ALLOCINDIR:
11604                         free_newblk(WK_NEWBLK(wk));
11605                         continue;
11606
11607                 case D_JNEWBLK:
11608                         wk->wk_state |= COMPLETE;
11609                         free_jnewblk(WK_JNEWBLK(wk));
11610                         continue;
11611
11612                 /*
11613                  * Save freed journal segments and add references on
11614                  * the supplied list which will delay their release
11615                  * until the cg bitmap is cleared on disk.
11616                  */
11617                 case D_JSEGDEP:
11618                         if (refhd == NULL)
11619                                 free_jsegdep(WK_JSEGDEP(wk));
11620                         else
11621                                 WORKLIST_INSERT(refhd, wk);
11622                         continue;
11623
11624                 case D_JADDREF:
11625                         jaddref = WK_JADDREF(wk);
11626                         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11627                             if_deps);
11628                         /*
11629                          * Transfer any jaddrefs to the list to be freed with
11630                          * the bitmap if we're handling a removed file.
11631                          */
11632                         if (refhd == NULL) {
11633                                 wk->wk_state |= COMPLETE;
11634                                 free_jaddref(jaddref);
11635                         } else
11636                                 WORKLIST_INSERT(refhd, wk);
11637                         continue;
11638
11639                 default:
11640                         panic("handle_bufwait: Unknown type %p(%s)",
11641                             wk, TYPENAME(wk->wk_type));
11642                         /* NOTREACHED */
11643                 }
11644         }
11645         return (freefile);
11646 }
11647 /*
11648  * Called from within softdep_disk_write_complete above to restore
11649  * in-memory inode block contents to their most up-to-date state. Note
11650  * that this routine is always called from interrupt level with further
11651  * interrupts from this device blocked.
11652  *
11653  * If the write did not succeed, we will do all the roll-forward
11654  * operations, but we will not take the actions that will allow its
11655  * dependencies to be processed.
11656  */
11657 static int
11658 handle_written_inodeblock(
11659         struct inodedep *inodedep,
11660         struct buf *bp,         /* buffer containing the inode block */
11661         int flags)
11662 {
11663         struct freefile *freefile;
11664         struct allocdirect *adp, *nextadp;
11665         struct ufs1_dinode *dp1 = NULL;
11666         struct ufs2_dinode *dp2 = NULL;
11667         struct workhead wkhd;
11668         int hadchanges, fstype;
11669         ino_t freelink;
11670
11671         LIST_INIT(&wkhd);
11672         hadchanges = 0;
11673         freefile = NULL;
11674         if ((inodedep->id_state & IOSTARTED) == 0)
11675                 panic("handle_written_inodeblock: not started");
11676         inodedep->id_state &= ~IOSTARTED;
11677         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11678                 fstype = UFS1;
11679                 dp1 = (struct ufs1_dinode *)bp->b_data +
11680                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11681                 freelink = dp1->di_freelink;
11682         } else {
11683                 fstype = UFS2;
11684                 dp2 = (struct ufs2_dinode *)bp->b_data +
11685                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11686                 freelink = dp2->di_freelink;
11687         }
11688         /*
11689          * Leave this inodeblock dirty until it's in the list.
11690          */
11691         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
11692             (flags & WRITESUCCEEDED)) {
11693                 struct inodedep *inon;
11694
11695                 inon = TAILQ_NEXT(inodedep, id_unlinked);
11696                 if ((inon == NULL && freelink == 0) ||
11697                     (inon && inon->id_ino == freelink)) {
11698                         if (inon)
11699                                 inon->id_state |= UNLINKPREV;
11700                         inodedep->id_state |= UNLINKNEXT;
11701                 }
11702                 hadchanges = 1;
11703         }
11704         /*
11705          * If we had to rollback the inode allocation because of
11706          * bitmaps being incomplete, then simply restore it.
11707          * Keep the block dirty so that it will not be reclaimed until
11708          * all associated dependencies have been cleared and the
11709          * corresponding updates written to disk.
11710          */
11711         if (inodedep->id_savedino1 != NULL) {
11712                 hadchanges = 1;
11713                 if (fstype == UFS1)
11714                         *dp1 = *inodedep->id_savedino1;
11715                 else
11716                         *dp2 = *inodedep->id_savedino2;
11717                 free(inodedep->id_savedino1, M_SAVEDINO);
11718                 inodedep->id_savedino1 = NULL;
11719                 if ((bp->b_flags & B_DELWRI) == 0)
11720                         stat_inode_bitmap++;
11721                 bdirty(bp);
11722                 /*
11723                  * If the inode is clear here and GOINGAWAY it will never
11724                  * be written.  Process the bufwait and clear any pending
11725                  * work which may include the freefile.
11726                  */
11727                 if (inodedep->id_state & GOINGAWAY)
11728                         goto bufwait;
11729                 return (1);
11730         }
11731         if (flags & WRITESUCCEEDED)
11732                 inodedep->id_state |= COMPLETE;
11733         /*
11734          * Roll forward anything that had to be rolled back before
11735          * the inode could be updated.
11736          */
11737         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11738                 nextadp = TAILQ_NEXT(adp, ad_next);
11739                 if (adp->ad_state & ATTACHED)
11740                         panic("handle_written_inodeblock: new entry");
11741                 if (fstype == UFS1) {
11742                         if (adp->ad_offset < UFS_NDADDR) {
11743                                 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11744                                         panic("%s %s #%jd mismatch %d != %jd",
11745                                             "handle_written_inodeblock:",
11746                                             "direct pointer",
11747                                             (intmax_t)adp->ad_offset,
11748                                             dp1->di_db[adp->ad_offset],
11749                                             (intmax_t)adp->ad_oldblkno);
11750                                 dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11751                         } else {
11752                                 if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] !=
11753                                     0)
11754                                         panic("%s: %s #%jd allocated as %d",
11755                                             "handle_written_inodeblock",
11756                                             "indirect pointer",
11757                                             (intmax_t)adp->ad_offset -
11758                                             UFS_NDADDR,
11759                                             dp1->di_ib[adp->ad_offset -
11760                                             UFS_NDADDR]);
11761                                 dp1->di_ib[adp->ad_offset - UFS_NDADDR] =
11762                                     adp->ad_newblkno;
11763                         }
11764                 } else {
11765                         if (adp->ad_offset < UFS_NDADDR) {
11766                                 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11767                                         panic("%s: %s #%jd %s %jd != %jd",
11768                                             "handle_written_inodeblock",
11769                                             "direct pointer",
11770                                             (intmax_t)adp->ad_offset, "mismatch",
11771                                             (intmax_t)dp2->di_db[adp->ad_offset],
11772                                             (intmax_t)adp->ad_oldblkno);
11773                                 dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11774                         } else {
11775                                 if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] !=
11776                                     0)
11777                                         panic("%s: %s #%jd allocated as %jd",
11778                                             "handle_written_inodeblock",
11779                                             "indirect pointer",
11780                                             (intmax_t)adp->ad_offset -
11781                                             UFS_NDADDR,
11782                                             (intmax_t)
11783                                             dp2->di_ib[adp->ad_offset -
11784                                             UFS_NDADDR]);
11785                                 dp2->di_ib[adp->ad_offset - UFS_NDADDR] =
11786                                     adp->ad_newblkno;
11787                         }
11788                 }
11789                 adp->ad_state &= ~UNDONE;
11790                 adp->ad_state |= ATTACHED;
11791                 hadchanges = 1;
11792         }
11793         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11794                 nextadp = TAILQ_NEXT(adp, ad_next);
11795                 if (adp->ad_state & ATTACHED)
11796                         panic("handle_written_inodeblock: new entry");
11797                 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11798                         panic("%s: direct pointers #%jd %s %jd != %jd",
11799                             "handle_written_inodeblock",
11800                             (intmax_t)adp->ad_offset, "mismatch",
11801                             (intmax_t)dp2->di_extb[adp->ad_offset],
11802                             (intmax_t)adp->ad_oldblkno);
11803                 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11804                 adp->ad_state &= ~UNDONE;
11805                 adp->ad_state |= ATTACHED;
11806                 hadchanges = 1;
11807         }
11808         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11809                 stat_direct_blk_ptrs++;
11810         /*
11811          * Reset the file size to its most up-to-date value.
11812          */
11813         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11814                 panic("handle_written_inodeblock: bad size");
11815         if (inodedep->id_savednlink > UFS_LINK_MAX)
11816                 panic("handle_written_inodeblock: Invalid link count "
11817                     "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
11818                     inodedep);
11819         if (fstype == UFS1) {
11820                 if (dp1->di_nlink != inodedep->id_savednlink) {
11821                         dp1->di_nlink = inodedep->id_savednlink;
11822                         hadchanges = 1;
11823                 }
11824                 if (dp1->di_size != inodedep->id_savedsize) {
11825                         dp1->di_size = inodedep->id_savedsize;
11826                         hadchanges = 1;
11827                 }
11828         } else {
11829                 if (dp2->di_nlink != inodedep->id_savednlink) {
11830                         dp2->di_nlink = inodedep->id_savednlink;
11831                         hadchanges = 1;
11832                 }
11833                 if (dp2->di_size != inodedep->id_savedsize) {
11834                         dp2->di_size = inodedep->id_savedsize;
11835                         hadchanges = 1;
11836                 }
11837                 if (dp2->di_extsize != inodedep->id_savedextsize) {
11838                         dp2->di_extsize = inodedep->id_savedextsize;
11839                         hadchanges = 1;
11840                 }
11841         }
11842         inodedep->id_savedsize = -1;
11843         inodedep->id_savedextsize = -1;
11844         inodedep->id_savednlink = -1;
11845         /*
11846          * If there were any rollbacks in the inode block, then it must be
11847          * marked dirty so that its will eventually get written back in
11848          * its correct form.
11849          */
11850         if (hadchanges) {
11851                 if (fstype == UFS2)
11852                         ffs_update_dinode_ckhash(inodedep->id_fs, dp2);
11853                 bdirty(bp);
11854         }
11855 bufwait:
11856         /*
11857          * If the write did not succeed, we have done all the roll-forward
11858          * operations, but we cannot take the actions that will allow its
11859          * dependencies to be processed.
11860          */
11861         if ((flags & WRITESUCCEEDED) == 0)
11862                 return (hadchanges);
11863         /*
11864          * Process any allocdirects that completed during the update.
11865          */
11866         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11867                 handle_allocdirect_partdone(adp, &wkhd);
11868         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11869                 handle_allocdirect_partdone(adp, &wkhd);
11870         /*
11871          * Process deallocations that were held pending until the
11872          * inode had been written to disk. Freeing of the inode
11873          * is delayed until after all blocks have been freed to
11874          * avoid creation of new <vfsid, inum, lbn> triples
11875          * before the old ones have been deleted.  Completely
11876          * unlinked inodes are not processed until the unlinked
11877          * inode list is written or the last reference is removed.
11878          */
11879         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11880                 freefile = handle_bufwait(inodedep, NULL);
11881                 if (freefile && !LIST_EMPTY(&wkhd)) {
11882                         WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11883                         freefile = NULL;
11884                 }
11885         }
11886         /*
11887          * Move rolled forward dependency completions to the bufwait list
11888          * now that those that were already written have been processed.
11889          */
11890         if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11891                 panic("handle_written_inodeblock: bufwait but no changes");
11892         jwork_move(&inodedep->id_bufwait, &wkhd);
11893
11894         if (freefile != NULL) {
11895                 /*
11896                  * If the inode is goingaway it was never written.  Fake up
11897                  * the state here so free_inodedep() can succeed.
11898                  */
11899                 if (inodedep->id_state & GOINGAWAY)
11900                         inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11901                 if (free_inodedep(inodedep) == 0)
11902                         panic("handle_written_inodeblock: live inodedep %p",
11903                             inodedep);
11904                 add_to_worklist(&freefile->fx_list, 0);
11905                 return (0);
11906         }
11907
11908         /*
11909          * If no outstanding dependencies, free it.
11910          */
11911         if (free_inodedep(inodedep) ||
11912             (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11913              TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11914              TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11915              LIST_FIRST(&inodedep->id_bufwait) == 0))
11916                 return (0);
11917         return (hadchanges);
11918 }
11919
11920 /*
11921  * Perform needed roll-forwards and kick off any dependencies that
11922  * can now be processed.
11923  *
11924  * If the write did not succeed, we will do all the roll-forward
11925  * operations, but we will not take the actions that will allow its
11926  * dependencies to be processed.
11927  */
11928 static int
11929 handle_written_indirdep(
11930         struct indirdep *indirdep,
11931         struct buf *bp,
11932         struct buf **bpp,
11933         int flags)
11934 {
11935         struct allocindir *aip;
11936         struct buf *sbp;
11937         int chgs;
11938
11939         if (indirdep->ir_state & GOINGAWAY)
11940                 panic("handle_written_indirdep: indirdep gone");
11941         if ((indirdep->ir_state & IOSTARTED) == 0)
11942                 panic("handle_written_indirdep: IO not started");
11943         chgs = 0;
11944         /*
11945          * If there were rollbacks revert them here.
11946          */
11947         if (indirdep->ir_saveddata) {
11948                 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11949                 if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11950                         free(indirdep->ir_saveddata, M_INDIRDEP);
11951                         indirdep->ir_saveddata = NULL;
11952                 }
11953                 chgs = 1;
11954         }
11955         indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11956         indirdep->ir_state |= ATTACHED;
11957         /*
11958          * If the write did not succeed, we have done all the roll-forward
11959          * operations, but we cannot take the actions that will allow its
11960          * dependencies to be processed.
11961          */
11962         if ((flags & WRITESUCCEEDED) == 0) {
11963                 stat_indir_blk_ptrs++;
11964                 bdirty(bp);
11965                 return (1);
11966         }
11967         /*
11968          * Move allocindirs with written pointers to the completehd if
11969          * the indirdep's pointer is not yet written.  Otherwise
11970          * free them here.
11971          */
11972         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
11973                 LIST_REMOVE(aip, ai_next);
11974                 if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11975                         LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11976                             ai_next);
11977                         newblk_freefrag(&aip->ai_block);
11978                         continue;
11979                 }
11980                 free_newblk(&aip->ai_block);
11981         }
11982         /*
11983          * Move allocindirs that have finished dependency processing from
11984          * the done list to the write list after updating the pointers.
11985          */
11986         if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11987                 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
11988                         handle_allocindir_partdone(aip);
11989                         if (aip == LIST_FIRST(&indirdep->ir_donehd))
11990                                 panic("disk_write_complete: not gone");
11991                         chgs = 1;
11992                 }
11993         }
11994         /*
11995          * Preserve the indirdep if there were any changes or if it is not
11996          * yet valid on disk.
11997          */
11998         if (chgs) {
11999                 stat_indir_blk_ptrs++;
12000                 bdirty(bp);
12001                 return (1);
12002         }
12003         /*
12004          * If there were no changes we can discard the savedbp and detach
12005          * ourselves from the buf.  We are only carrying completed pointers
12006          * in this case.
12007          */
12008         sbp = indirdep->ir_savebp;
12009         sbp->b_flags |= B_INVAL | B_NOCACHE;
12010         indirdep->ir_savebp = NULL;
12011         indirdep->ir_bp = NULL;
12012         if (*bpp != NULL)
12013                 panic("handle_written_indirdep: bp already exists.");
12014         *bpp = sbp;
12015         /*
12016          * The indirdep may not be freed until its parent points at it.
12017          */
12018         if (indirdep->ir_state & DEPCOMPLETE)
12019                 free_indirdep(indirdep);
12020
12021         return (0);
12022 }
12023
12024 /*
12025  * Process a diradd entry after its dependent inode has been written.
12026  */
12027 static void
12028 diradd_inode_written(
12029         struct diradd *dap,
12030         struct inodedep *inodedep)
12031 {
12032
12033         LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp));
12034         dap->da_state |= COMPLETE;
12035         complete_diradd(dap);
12036         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
12037 }
12038
12039 /*
12040  * Returns true if the bmsafemap will have rollbacks when written.  Must only
12041  * be called with the per-filesystem lock and the buf lock on the cg held.
12042  */
12043 static int
12044 bmsafemap_backgroundwrite(
12045         struct bmsafemap *bmsafemap,
12046         struct buf *bp)
12047 {
12048         int dirty;
12049
12050         LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
12051         dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
12052             !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
12053         /*
12054          * If we're initiating a background write we need to process the
12055          * rollbacks as they exist now, not as they exist when IO starts.
12056          * No other consumers will look at the contents of the shadowed
12057          * buf so this is safe to do here.
12058          */
12059         if (bp->b_xflags & BX_BKGRDMARKER)
12060                 initiate_write_bmsafemap(bmsafemap, bp);
12061
12062         return (dirty);
12063 }
12064
12065 /*
12066  * Re-apply an allocation when a cg write is complete.
12067  */
12068 static int
12069 jnewblk_rollforward(
12070         struct jnewblk *jnewblk,
12071         struct fs *fs,
12072         struct cg *cgp,
12073         uint8_t *blksfree)
12074 {
12075         ufs1_daddr_t fragno;
12076         ufs2_daddr_t blkno;
12077         long cgbno, bbase;
12078         int frags, blk;
12079         int i;
12080
12081         frags = 0;
12082         cgbno = dtogd(fs, jnewblk->jn_blkno);
12083         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
12084                 if (isclr(blksfree, cgbno + i))
12085                         panic("jnewblk_rollforward: re-allocated fragment");
12086                 frags++;
12087         }
12088         if (frags == fs->fs_frag) {
12089                 blkno = fragstoblks(fs, cgbno);
12090                 ffs_clrblock(fs, blksfree, (long)blkno);
12091                 ffs_clusteracct(fs, cgp, blkno, -1);
12092                 cgp->cg_cs.cs_nbfree--;
12093         } else {
12094                 bbase = cgbno - fragnum(fs, cgbno);
12095                 cgbno += jnewblk->jn_oldfrags;
12096                 /* If a complete block had been reassembled, account for it. */
12097                 fragno = fragstoblks(fs, bbase);
12098                 if (ffs_isblock(fs, blksfree, fragno)) {
12099                         cgp->cg_cs.cs_nffree += fs->fs_frag;
12100                         ffs_clusteracct(fs, cgp, fragno, -1);
12101                         cgp->cg_cs.cs_nbfree--;
12102                 }
12103                 /* Decrement the old frags.  */
12104                 blk = blkmap(fs, blksfree, bbase);
12105                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
12106                 /* Allocate the fragment */
12107                 for (i = 0; i < frags; i++)
12108                         clrbit(blksfree, cgbno + i);
12109                 cgp->cg_cs.cs_nffree -= frags;
12110                 /* Add back in counts associated with the new frags */
12111                 blk = blkmap(fs, blksfree, bbase);
12112                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
12113         }
12114         return (frags);
12115 }
12116
12117 /*
12118  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
12119  * changes if it's not a background write.  Set all written dependencies
12120  * to DEPCOMPLETE and free the structure if possible.
12121  *
12122  * If the write did not succeed, we will do all the roll-forward
12123  * operations, but we will not take the actions that will allow its
12124  * dependencies to be processed.
12125  */
12126 static int
12127 handle_written_bmsafemap(
12128         struct bmsafemap *bmsafemap,
12129         struct buf *bp,
12130         int flags)
12131 {
12132         struct newblk *newblk;
12133         struct inodedep *inodedep;
12134         struct jaddref *jaddref, *jatmp;
12135         struct jnewblk *jnewblk, *jntmp;
12136         struct ufsmount *ump;
12137         uint8_t *inosused;
12138         uint8_t *blksfree;
12139         struct cg *cgp;
12140         struct fs *fs;
12141         ino_t ino;
12142         int foreground;
12143         int chgs;
12144
12145         if ((bmsafemap->sm_state & IOSTARTED) == 0)
12146                 panic("handle_written_bmsafemap: Not started\n");
12147         ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
12148         chgs = 0;
12149         bmsafemap->sm_state &= ~IOSTARTED;
12150         foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
12151         /*
12152          * If write was successful, release journal work that was waiting
12153          * on the write. Otherwise move the work back.
12154          */
12155         if (flags & WRITESUCCEEDED)
12156                 handle_jwork(&bmsafemap->sm_freewr);
12157         else
12158                 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
12159                     worklist, wk_list);
12160
12161         /*
12162          * Restore unwritten inode allocation pending jaddref writes.
12163          */
12164         if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
12165                 cgp = (struct cg *)bp->b_data;
12166                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
12167                 inosused = cg_inosused(cgp);
12168                 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
12169                     ja_bmdeps, jatmp) {
12170                         if ((jaddref->ja_state & UNDONE) == 0)
12171                                 continue;
12172                         ino = jaddref->ja_ino % fs->fs_ipg;
12173                         if (isset(inosused, ino))
12174                                 panic("handle_written_bmsafemap: "
12175                                     "re-allocated inode");
12176                         /* Do the roll-forward only if it's a real copy. */
12177                         if (foreground) {
12178                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
12179                                         cgp->cg_cs.cs_ndir++;
12180                                 cgp->cg_cs.cs_nifree--;
12181                                 setbit(inosused, ino);
12182                                 chgs = 1;
12183                         }
12184                         jaddref->ja_state &= ~UNDONE;
12185                         jaddref->ja_state |= ATTACHED;
12186                         free_jaddref(jaddref);
12187                 }
12188         }
12189         /*
12190          * Restore any block allocations which are pending journal writes.
12191          */
12192         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
12193                 cgp = (struct cg *)bp->b_data;
12194                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
12195                 blksfree = cg_blksfree(cgp);
12196                 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
12197                     jntmp) {
12198                         if ((jnewblk->jn_state & UNDONE) == 0)
12199                                 continue;
12200                         /* Do the roll-forward only if it's a real copy. */
12201                         if (foreground &&
12202                             jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
12203                                 chgs = 1;
12204                         jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
12205                         jnewblk->jn_state |= ATTACHED;
12206                         free_jnewblk(jnewblk);
12207                 }
12208         }
12209         /*
12210          * If the write did not succeed, we have done all the roll-forward
12211          * operations, but we cannot take the actions that will allow its
12212          * dependencies to be processed.
12213          */
12214         if ((flags & WRITESUCCEEDED) == 0) {
12215                 LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
12216                     newblk, nb_deps);
12217                 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
12218                     worklist, wk_list);
12219                 if (foreground)
12220                         bdirty(bp);
12221                 return (1);
12222         }
12223         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
12224                 newblk->nb_state |= DEPCOMPLETE;
12225                 newblk->nb_state &= ~ONDEPLIST;
12226                 newblk->nb_bmsafemap = NULL;
12227                 LIST_REMOVE(newblk, nb_deps);
12228                 if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
12229                         handle_allocdirect_partdone(
12230                             WK_ALLOCDIRECT(&newblk->nb_list), NULL);
12231                 else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
12232                         handle_allocindir_partdone(
12233                             WK_ALLOCINDIR(&newblk->nb_list));
12234                 else if (newblk->nb_list.wk_type != D_NEWBLK)
12235                         panic("handle_written_bmsafemap: Unexpected type: %s",
12236                             TYPENAME(newblk->nb_list.wk_type));
12237         }
12238         while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
12239                 inodedep->id_state |= DEPCOMPLETE;
12240                 inodedep->id_state &= ~ONDEPLIST;
12241                 LIST_REMOVE(inodedep, id_deps);
12242                 inodedep->id_bmsafemap = NULL;
12243         }
12244         LIST_REMOVE(bmsafemap, sm_next);
12245         if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
12246             LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
12247             LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
12248             LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
12249             LIST_EMPTY(&bmsafemap->sm_freehd)) {
12250                 LIST_REMOVE(bmsafemap, sm_hash);
12251                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
12252                 return (0);
12253         }
12254         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
12255         if (foreground)
12256                 bdirty(bp);
12257         return (1);
12258 }
12259
12260 /*
12261  * Try to free a mkdir dependency.
12262  */
12263 static void
12264 complete_mkdir(struct mkdir *mkdir)
12265 {
12266         struct diradd *dap;
12267
12268         if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
12269                 return;
12270         LIST_REMOVE(mkdir, md_mkdirs);
12271         dap = mkdir->md_diradd;
12272         dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
12273         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
12274                 dap->da_state |= DEPCOMPLETE;
12275                 complete_diradd(dap);
12276         }
12277         WORKITEM_FREE(mkdir, D_MKDIR);
12278 }
12279
12280 /*
12281  * Handle the completion of a mkdir dependency.
12282  */
12283 static void
12284 handle_written_mkdir(struct mkdir *mkdir, int type)
12285 {
12286
12287         if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
12288                 panic("handle_written_mkdir: bad type");
12289         mkdir->md_state |= COMPLETE;
12290         complete_mkdir(mkdir);
12291 }
12292
12293 static int
12294 free_pagedep(struct pagedep *pagedep)
12295 {
12296         int i;
12297
12298         if (pagedep->pd_state & NEWBLOCK)
12299                 return (0);
12300         if (!LIST_EMPTY(&pagedep->pd_dirremhd))
12301                 return (0);
12302         for (i = 0; i < DAHASHSZ; i++)
12303                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
12304                         return (0);
12305         if (!LIST_EMPTY(&pagedep->pd_pendinghd))
12306                 return (0);
12307         if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
12308                 return (0);
12309         if (pagedep->pd_state & ONWORKLIST)
12310                 WORKLIST_REMOVE(&pagedep->pd_list);
12311         LIST_REMOVE(pagedep, pd_hash);
12312         WORKITEM_FREE(pagedep, D_PAGEDEP);
12313
12314         return (1);
12315 }
12316
12317 /*
12318  * Called from within softdep_disk_write_complete above.
12319  * A write operation was just completed. Removed inodes can
12320  * now be freed and associated block pointers may be committed.
12321  * Note that this routine is always called from interrupt level
12322  * with further interrupts from this device blocked.
12323  *
12324  * If the write did not succeed, we will do all the roll-forward
12325  * operations, but we will not take the actions that will allow its
12326  * dependencies to be processed.
12327  */
12328 static int
12329 handle_written_filepage(
12330         struct pagedep *pagedep,
12331         struct buf *bp,         /* buffer containing the written page */
12332         int flags)
12333 {
12334         struct dirrem *dirrem;
12335         struct diradd *dap, *nextdap;
12336         struct direct *ep;
12337         int i, chgs;
12338
12339         if ((pagedep->pd_state & IOSTARTED) == 0)
12340                 panic("handle_written_filepage: not started");
12341         pagedep->pd_state &= ~IOSTARTED;
12342         if ((flags & WRITESUCCEEDED) == 0)
12343                 goto rollforward;
12344         /*
12345          * Process any directory removals that have been committed.
12346          */
12347         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
12348                 LIST_REMOVE(dirrem, dm_next);
12349                 dirrem->dm_state |= COMPLETE;
12350                 dirrem->dm_dirinum = pagedep->pd_ino;
12351                 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
12352                     ("handle_written_filepage: Journal entries not written."));
12353                 add_to_worklist(&dirrem->dm_list, 0);
12354         }
12355         /*
12356          * Free any directory additions that have been committed.
12357          * If it is a newly allocated block, we have to wait until
12358          * the on-disk directory inode claims the new block.
12359          */
12360         if ((pagedep->pd_state & NEWBLOCK) == 0)
12361                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
12362                         free_diradd(dap, NULL);
12363 rollforward:
12364         /*
12365          * Uncommitted directory entries must be restored.
12366          */
12367         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
12368                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
12369                      dap = nextdap) {
12370                         nextdap = LIST_NEXT(dap, da_pdlist);
12371                         if (dap->da_state & ATTACHED)
12372                                 panic("handle_written_filepage: attached");
12373                         ep = (struct direct *)
12374                             ((char *)bp->b_data + dap->da_offset);
12375                         ep->d_ino = dap->da_newinum;
12376                         dap->da_state &= ~UNDONE;
12377                         dap->da_state |= ATTACHED;
12378                         chgs = 1;
12379                         /*
12380                          * If the inode referenced by the directory has
12381                          * been written out, then the dependency can be
12382                          * moved to the pending list.
12383                          */
12384                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
12385                                 LIST_REMOVE(dap, da_pdlist);
12386                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
12387                                     da_pdlist);
12388                         }
12389                 }
12390         }
12391         /*
12392          * If there were any rollbacks in the directory, then it must be
12393          * marked dirty so that its will eventually get written back in
12394          * its correct form.
12395          */
12396         if (chgs || (flags & WRITESUCCEEDED) == 0) {
12397                 if ((bp->b_flags & B_DELWRI) == 0)
12398                         stat_dir_entry++;
12399                 bdirty(bp);
12400                 return (1);
12401         }
12402         /*
12403          * If we are not waiting for a new directory block to be
12404          * claimed by its inode, then the pagedep will be freed.
12405          * Otherwise it will remain to track any new entries on
12406          * the page in case they are fsync'ed.
12407          */
12408         free_pagedep(pagedep);
12409         return (0);
12410 }
12411
12412 /*
12413  * Writing back in-core inode structures.
12414  *
12415  * The filesystem only accesses an inode's contents when it occupies an
12416  * "in-core" inode structure.  These "in-core" structures are separate from
12417  * the page frames used to cache inode blocks.  Only the latter are
12418  * transferred to/from the disk.  So, when the updated contents of the
12419  * "in-core" inode structure are copied to the corresponding in-memory inode
12420  * block, the dependencies are also transferred.  The following procedure is
12421  * called when copying a dirty "in-core" inode to a cached inode block.
12422  */
12423
12424 /*
12425  * Called when an inode is loaded from disk. If the effective link count
12426  * differed from the actual link count when it was last flushed, then we
12427  * need to ensure that the correct effective link count is put back.
12428  */
12429 void
12430 softdep_load_inodeblock(
12431         struct inode *ip)       /* the "in_core" copy of the inode */
12432 {
12433         struct inodedep *inodedep;
12434         struct ufsmount *ump;
12435
12436         ump = ITOUMP(ip);
12437         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
12438             ("softdep_load_inodeblock called on non-softdep filesystem"));
12439         /*
12440          * Check for alternate nlink count.
12441          */
12442         ip->i_effnlink = ip->i_nlink;
12443         ACQUIRE_LOCK(ump);
12444         if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
12445                 FREE_LOCK(ump);
12446                 return;
12447         }
12448         if (ip->i_nlink != inodedep->id_nlinkwrote &&
12449             inodedep->id_nlinkwrote != -1) {
12450                 KASSERT(ip->i_nlink == 0 &&
12451                     (ump->um_flags & UM_FSFAIL_CLEANUP) != 0,
12452                     ("read bad i_nlink value"));
12453                 ip->i_effnlink = ip->i_nlink = inodedep->id_nlinkwrote;
12454         }
12455         ip->i_effnlink -= inodedep->id_nlinkdelta;
12456         KASSERT(ip->i_effnlink >= 0,
12457             ("softdep_load_inodeblock: negative i_effnlink"));
12458         FREE_LOCK(ump);
12459 }
12460
12461 /*
12462  * This routine is called just before the "in-core" inode
12463  * information is to be copied to the in-memory inode block.
12464  * Recall that an inode block contains several inodes. If
12465  * the force flag is set, then the dependencies will be
12466  * cleared so that the update can always be made. Note that
12467  * the buffer is locked when this routine is called, so we
12468  * will never be in the middle of writing the inode block
12469  * to disk.
12470  */
12471 void
12472 softdep_update_inodeblock(
12473         struct inode *ip,       /* the "in_core" copy of the inode */
12474         struct buf *bp,         /* the buffer containing the inode block */
12475         int waitfor)            /* nonzero => update must be allowed */
12476 {
12477         struct inodedep *inodedep;
12478         struct inoref *inoref;
12479         struct ufsmount *ump;
12480         struct worklist *wk;
12481         struct mount *mp;
12482         struct buf *ibp;
12483         struct fs *fs;
12484         int error;
12485
12486         ump = ITOUMP(ip);
12487         mp = UFSTOVFS(ump);
12488         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12489             ("softdep_update_inodeblock called on non-softdep filesystem"));
12490         fs = ump->um_fs;
12491         /*
12492          * If the effective link count is not equal to the actual link
12493          * count, then we must track the difference in an inodedep while
12494          * the inode is (potentially) tossed out of the cache. Otherwise,
12495          * if there is no existing inodedep, then there are no dependencies
12496          * to track.
12497          */
12498         ACQUIRE_LOCK(ump);
12499 again:
12500         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12501                 FREE_LOCK(ump);
12502                 if (ip->i_effnlink != ip->i_nlink)
12503                         panic("softdep_update_inodeblock: bad link count");
12504                 return;
12505         }
12506         /*
12507          * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12508          * does not have access to the in-core ip so must write directly into
12509          * the inode block buffer when setting freelink.
12510          */
12511         if ((inodedep->id_state & UNLINKED) != 0) {
12512                 if (fs->fs_magic == FS_UFS1_MAGIC)
12513                         DIP_SET(ip, i_freelink,
12514                             ((struct ufs1_dinode *)bp->b_data +
12515                             ino_to_fsbo(fs, ip->i_number))->di_freelink);
12516                 else
12517                         DIP_SET(ip, i_freelink,
12518                             ((struct ufs2_dinode *)bp->b_data +
12519                             ino_to_fsbo(fs, ip->i_number))->di_freelink);
12520         }
12521         KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta,
12522             ("softdep_update_inodeblock inconsistent ip %p i_nlink %d "
12523             "inodedep %p id_nlinkdelta %jd",
12524             ip, ip->i_nlink, inodedep, (intmax_t)inodedep->id_nlinkdelta));
12525         inodedep->id_nlinkwrote = ip->i_nlink;
12526         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12527                 panic("softdep_update_inodeblock: bad delta");
12528         /*
12529          * If we're flushing all dependencies we must also move any waiting
12530          * for journal writes onto the bufwait list prior to I/O.
12531          */
12532         if (waitfor) {
12533                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12534                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12535                             == DEPCOMPLETE) {
12536                                 jwait(&inoref->if_list, MNT_WAIT);
12537                                 goto again;
12538                         }
12539                 }
12540         }
12541         /*
12542          * Changes have been initiated. Anything depending on these
12543          * changes cannot occur until this inode has been written.
12544          */
12545         inodedep->id_state &= ~COMPLETE;
12546         if ((inodedep->id_state & ONWORKLIST) == 0)
12547                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12548         /*
12549          * Any new dependencies associated with the incore inode must
12550          * now be moved to the list associated with the buffer holding
12551          * the in-memory copy of the inode. Once merged process any
12552          * allocdirects that are completed by the merger.
12553          */
12554         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12555         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12556                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12557                     NULL);
12558         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12559         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12560                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12561                     NULL);
12562         /*
12563          * Now that the inode has been pushed into the buffer, the
12564          * operations dependent on the inode being written to disk
12565          * can be moved to the id_bufwait so that they will be
12566          * processed when the buffer I/O completes.
12567          */
12568         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12569                 WORKLIST_REMOVE(wk);
12570                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12571         }
12572         /*
12573          * Newly allocated inodes cannot be written until the bitmap
12574          * that allocates them have been written (indicated by
12575          * DEPCOMPLETE being set in id_state). If we are doing a
12576          * forced sync (e.g., an fsync on a file), we force the bitmap
12577          * to be written so that the update can be done.
12578          */
12579         if (waitfor == 0) {
12580                 FREE_LOCK(ump);
12581                 return;
12582         }
12583 retry:
12584         if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12585                 FREE_LOCK(ump);
12586                 return;
12587         }
12588         ibp = inodedep->id_bmsafemap->sm_buf;
12589         ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12590         if (ibp == NULL) {
12591                 /*
12592                  * If ibp came back as NULL, the dependency could have been
12593                  * freed while we slept.  Look it up again, and check to see
12594                  * that it has completed.
12595                  */
12596                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12597                         goto retry;
12598                 FREE_LOCK(ump);
12599                 return;
12600         }
12601         FREE_LOCK(ump);
12602         if ((error = bwrite(ibp)) != 0)
12603                 softdep_error("softdep_update_inodeblock: bwrite", error);
12604 }
12605
12606 /*
12607  * Merge the a new inode dependency list (such as id_newinoupdt) into an
12608  * old inode dependency list (such as id_inoupdt).
12609  */
12610 static void
12611 merge_inode_lists(
12612         struct allocdirectlst *newlisthead,
12613         struct allocdirectlst *oldlisthead)
12614 {
12615         struct allocdirect *listadp, *newadp;
12616
12617         newadp = TAILQ_FIRST(newlisthead);
12618         if (newadp != NULL)
12619                 LOCK_OWNED(VFSTOUFS(newadp->ad_block.nb_list.wk_mp));
12620         for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12621                 if (listadp->ad_offset < newadp->ad_offset) {
12622                         listadp = TAILQ_NEXT(listadp, ad_next);
12623                         continue;
12624                 }
12625                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
12626                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12627                 if (listadp->ad_offset == newadp->ad_offset) {
12628                         allocdirect_merge(oldlisthead, newadp,
12629                             listadp);
12630                         listadp = newadp;
12631                 }
12632                 newadp = TAILQ_FIRST(newlisthead);
12633         }
12634         while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12635                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
12636                 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12637         }
12638 }
12639
12640 /*
12641  * If we are doing an fsync, then we must ensure that any directory
12642  * entries for the inode have been written after the inode gets to disk.
       *
       * For each diradd on the inode's id_pendinghd list this routine locks
       * the parent directory vnode, flushes the parent inode first when the
       * entry has MKDIR_PARENT set or its pagedep has NEWBLOCK set, and then
       * reads and synchronously rewrites the directory block holding the entry.
       *
       * Returns 0 when the mount does not use soft updates, when the inode
       * has no inodedep, or when processing stops normally.  Otherwise it
       * returns the error from get_parent_vp() (ERELOOKUP from it is treated
       * as success), ffs_update(), ffs_syncvnode() or the directory block I/O.
       * When the parent needed a full ffs_syncvnode(), vp is unlocked for its
       * duration, relocked, and ERELOOKUP is returned in place of success.
       *
       * Panics if allocation or inowait dependencies are still queued on the
       * inodedep, if a pending item is not a D_DIRADD, or if a diradd is not
       * COMPLETE with MKDIR_BODY clear.
12643  */
12644 int
12645 softdep_fsync(
12646         struct vnode *vp)       /* the "in_core" copy of the inode */
12647 {
12648         struct inodedep *inodedep;
12649         struct pagedep *pagedep;
12650         struct inoref *inoref;
12651         struct ufsmount *ump;
12652         struct worklist *wk;
12653         struct diradd *dap;
12654         struct mount *mp;
12655         struct vnode *pvp;
12656         struct inode *ip;
12657         struct buf *bp;
12658         struct fs *fs;
12659         struct thread *td = curthread;
12660         int error, flushparent, pagedep_new_block;
12661         ino_t parentino;
12662         ufs_lbn_t lbn;
12663
12664         ip = VTOI(vp);
12665         mp = vp->v_mount;
12666         ump = VFSTOUFS(mp);
12667         fs = ump->um_fs;
              /* Nothing to do when the mount is not using soft updates. */
12668         if (MOUNTEDSOFTDEP(mp) == 0)
12669                 return (0);
12670         ACQUIRE_LOCK(ump);
12671 restart:
              /* No inodedep means there is nothing pending for this inode. */
12672         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12673                 FREE_LOCK(ump);
12674                 return (0);
12675         }
              /*
               * Wait on any inoref that is DEPCOMPLETE but not GOINGAWAY, then
               * redo the lookup and scan from the top.
               * NOTE(review): presumably jwait() can sleep and let the lists
               * change, which is why the scan is restarted -- confirm.
               */
12676         TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12677                 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12678                     == DEPCOMPLETE) {
12679                         jwait(&inoref->if_list, MNT_WAIT);
12680                         goto restart;
12681                 }
12682         }
              /*
               * Anything still queued on the inowait or allocdirect lists at
               * this point is treated as fatal.
               */
12683         if (!LIST_EMPTY(&inodedep->id_inowait) ||
12684             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12685             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12686             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12687             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12688                 panic("softdep_fsync: pending ops %p", inodedep);
              /*
               * Each iteration handles the first item on id_pendinghd.  The
               * softdep lock is held at the top of every iteration.
               */
12689         for (error = 0, flushparent = 0; ; ) {
12690                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12691                         break;
12692                 if (wk->wk_type != D_DIRADD)
12693                         panic("softdep_fsync: Unexpected type %s",
12694                             TYPENAME(wk->wk_type));
12695                 dap = WK_DIRADD(wk);
12696                 /*
12697                  * Flush our parent if this directory entry has a MKDIR_PARENT
12698                  * dependency or is contained in a newly allocated block.
                       *
                       * For a DIRCHG entry the pagedep is reached through the
                       * previous dirrem rather than through the diradd itself.
                       * The parent inode number and block number are copied
                       * out here because the softdep lock is dropped below.
12699                  */
12700                 if (dap->da_state & DIRCHG)
12701                         pagedep = dap->da_previous->dm_pagedep;
12702                 else
12703                         pagedep = dap->da_pagedep;
12704                 parentino = pagedep->pd_ino;
12705                 lbn = pagedep->pd_lbn;
12706                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12707                         panic("softdep_fsync: dirty");
12708                 if ((dap->da_state & MKDIR_PARENT) ||
12709                     (pagedep->pd_state & NEWBLOCK))
12710                         flushparent = 1;
12711                 else
12712                         flushparent = 0;
12713                 /*
12714                  * If we are being fsync'ed as part of vgone'ing this vnode,
12715                  * then we will not be able to release and recover the
12716                  * vnode below, so we just have to give up on writing its
12717                  * directory entry out. It will eventually be written, just
12718                  * not now, but then the user was not asking to have it
12719                  * written, so we are not breaking any promises.
12720                  */
12721                 if (VN_IS_DOOMED(vp))
12722                         break;
12723                 /*
12724                  * We prevent deadlock by always fetching inodes from the
12725                  * root, moving down the directory tree. Thus, when fetching
12726                  * our parent directory, we first try to get the lock. If
12727                  * that fails, we must unlock ourselves before requesting
12728                  * the lock on our parent. See the comment in ufs_lookup
12729                  * for details on possible races.
12730                  */
12731                 FREE_LOCK(ump);
12732                 error = get_parent_vp(vp, mp, parentino, NULL, NULL, NULL,
12733                     &pvp);
                      /* ERELOOKUP from get_parent_vp() is not a failure here. */
12734                 if (error == ERELOOKUP)
12735                         error = 0;
12736                 if (error != 0)
12737                         return (error);
12738                 /*
12739                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12740                  * that are contained in direct blocks will be resolved by
12741                  * doing a ffs_update. Pagedeps contained in indirect blocks
12742                  * may require a complete sync'ing of the directory. So, we
12743                  * try the cheap and fast ffs_update first, and if that fails,
12744                  * then we do the slower ffs_syncvnode of the directory.
12745                  */
12746                 if (flushparent) {
12747                         int locked;
12748
12749                         if ((error = ffs_update(pvp, 1)) != 0) {
12750                                 vput(pvp);
12751                                 return (error);
12752                         }
                              /*
                               * The softdep lock was dropped above, so look the
                               * inodedep and its first pending diradd up again
                               * and sample NEWBLOCK while holding the lock.
                               * "locked" tracks whether the lock is still held
                               * when the nested tests fall through.
                               */
12753                         ACQUIRE_LOCK(ump);
12754                         locked = 1;
12755                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12756                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12757                                         if (wk->wk_type != D_DIRADD)
12758                                                 panic("softdep_fsync: Unexpected type %s",
12759                                                       TYPENAME(wk->wk_type));
12760                                         dap = WK_DIRADD(wk);
12761                                         if (dap->da_state & DIRCHG)
12762                                                 pagedep = dap->da_previous->dm_pagedep;
12763                                         else
12764                                                 pagedep = dap->da_pagedep;
12765                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12766                                         FREE_LOCK(ump);
12767                                         locked = 0;
                                              /*
                                               * NEWBLOCK is still set after
                                               * ffs_update(): sync the whole
                                               * parent with vp unlocked, then
                                               * relock vp and return.  Success
                                               * is reported as ERELOOKUP.
                                               */
12768                                         if (pagedep_new_block) {
12769                                                 VOP_UNLOCK(vp);
12770                                                 error = ffs_syncvnode(pvp,
12771                                                     MNT_WAIT, 0);
12772                                                 if (error == 0)
12773                                                         error = ERELOOKUP;
12774                                                 vput(pvp);
12775                                                 vn_lock(vp, LK_EXCLUSIVE |
12776                                                     LK_RETRY);
12777                                                 return (error);
12778                                         }
12779                                 }
12780                         }
12781                         if (locked)
12782                                 FREE_LOCK(ump);
12783                 }
12784                 /*
12785                  * Flush directory page containing the inode's name.
                       *
                       * NOTE(review): on a bread() failure bp is handed to
                       * brelse(); confirm that bread() leaves bp pointing at a
                       * valid buffer in that case.
12786                  */
12787                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12788                     &bp);
12789                 if (error == 0)
12790                         error = bwrite(bp);
12791                 else
12792                         brelse(bp);
12793                 vput(pvp);
                      /*
                       * NOTE(review): this return is taken whenever
                       * ffs_fsfail_cleanup() yields false, whatever the value
                       * of error.  Confirm what it returns for error == 0; a
                       * false result there ends the loop after one entry.
                       */
12794                 if (!ffs_fsfail_cleanup(ump, error))
12795                         return (error);
                      /* Retake the lock and refresh inodedep for the next pass. */
12796                 ACQUIRE_LOCK(ump);
12797                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12798                         break;
12799         }
12800         FREE_LOCK(ump);
12801         return (0);
12802 }
12803
12804 /*
12805  * Flush all the dirty bitmaps associated with the block device
12806  * before flushing the rest of the dirty blocks so as to reduce
12807  * the number of dependencies that will have to be rolled back.
12808  *
       * vp must be a disk vnode; the routine panics otherwise.  Each buffer
       * on the vnode's dirty list that can be locked without waiting, whose
       * first dependency is a D_BMSAFEMAP and which has no background write
       * in progress is written with bawrite().  The scan starts over after
       * every write, and drain_output() is called once a full scan issues
       * no write.
       *
12809  * XXX Unused?
12810  */
12811 void
12812 softdep_fsync_mountdev(struct vnode *vp)
12813 {
12814         struct buf *bp, *nbp;
12815         struct worklist *wk;
12816         struct bufobj *bo;
12817
12818         if (!vn_isdisk(vp))
12819                 panic("softdep_fsync_mountdev: vnode not a disk");
12820         bo = &vp->v_bufobj;
12821 restart:
12822         BO_LOCK(bo);
12823         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12824                 /*
12825                  * If it is already scheduled, skip to the next buffer.
12826                  */
12827                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12828                         continue;
12829
                      /* Everything on the dirty list must be a delayed write. */
12830                 if ((bp->b_flags & B_DELWRI) == 0)
12831                         panic("softdep_fsync_mountdev: not dirty");
12832                 /*
12833                  * We are only interested in bitmaps with outstanding
12834                  * dependencies.
12835                  */
12836                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12837                     wk->wk_type != D_BMSAFEMAP ||
12838                     (bp->b_vflags & BV_BKGRDINPROG)) {
12839                         BUF_UNLOCK(bp);
12840                         continue;
12841                 }
                      /*
                       * The bufobj lock is released before the write is
                       * issued, so the list is rescanned from the beginning
                       * rather than continuing from nbp.
                       */
12842                 BO_UNLOCK(bo);
12843                 bremfree(bp);
12844                 (void) bawrite(bp);
12845                 goto restart;
12846         }
              /* drain_output() is called with the bufobj lock still held. */
12847         drain_output(vp);
12848         BO_UNLOCK(bo);
12849 }
12850
12851 /*
12852  * Sync all cylinder groups that were dirty at the time this function is
12853  * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12854  * is used to flush freedep activity that may be holding up writes to an
12855  * indirect block.
       *
       * mp is the mount whose softdep_dirtycg list is walked.  With waitfor
       * equal to MNT_NOWAIT the bitmap buffers are written with bawrite() and
       * no error is collected; otherwise they are written with bwrite() and
       * the walk stops at the first failure.
       *
       * Returns 0, or the first error returned by bwrite().  The softdep lock
       * is acquired and released internally.
12856  */
12857 static int
12858 sync_cgs(struct mount *mp, int waitfor)
12859 {
12860         struct bmsafemap *bmsafemap;
12861         struct bmsafemap *sentinel;
12862         struct ufsmount *ump;
12863         struct buf *bp;
12864         int error;
12865
              /*
               * The sentinel is a zeroed bmsafemap with sm_cg set to -1.  It is
               * linked into the dirty list to remember our position while the
               * softdep lock is dropped for I/O.
               */
12866         sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12867         sentinel->sm_cg = -1;
12868         ump = VFSTOUFS(mp);
12869         error = 0;
12870         ACQUIRE_LOCK(ump);
12871         LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
              /*
               * Always look at the entry directly after the sentinel.  Progress
               * is made by moving the sentinel past an entry once it has been
               * dealt with.
               */
12872         for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12873             bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12874                 /* Skip sentinels and cgs with no work to release. */
12875                 if (bmsafemap->sm_cg == -1 ||
12876                     (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12877                     LIST_EMPTY(&bmsafemap->sm_freewr))) {
12878                         LIST_REMOVE(sentinel, sm_next);
12879                         LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12880                         continue;
12881                 }
12882                 /*
12883                  * If we don't get the lock and we're waiting try again, if
12884                  * not move on to the next buf and try to sync it.
                       *
                       * In the MNT_WAIT retry case the sentinel is left where
                       * it is, so the same entry is examined again.
12885                  */
12886                 bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12887                 if (bp == NULL && waitfor == MNT_WAIT)
12888                         continue;
12889                 LIST_REMOVE(sentinel, sm_next);
12890                 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12891                 if (bp == NULL)
12892                         continue;
                      /* Issue the write with the softdep lock released. */
12893                 FREE_LOCK(ump);
12894                 if (waitfor == MNT_NOWAIT)
12895                         bawrite(bp);
12896                 else
12897                         error = bwrite(bp);
12898                 ACQUIRE_LOCK(ump);
12899                 if (error)
12900                         break;
12901         }
              /* Unlink and free our sentinel before returning. */
12902         LIST_REMOVE(sentinel, sm_next);
12903         FREE_LOCK(ump);
12904         free(sentinel, M_BMSAFEMAP);
12905         return (error);
12906 }
12907
12908 /*
12909  * This routine is called when we are trying to synchronously flush a
12910  * file. This routine must eliminate any filesystem metadata dependencies
12911  * so that the syncing routine can succeed.
       *
       * vp must belong to a soft updates mount (asserted).  All work is done
       * under the softdep lock, which is taken and released here.
       *
       * Returns the value returned by flush_inodedep_deps().  Note that
       * process_truncates() is called even when that value is nonzero.
12912  */
12913 int
12914 softdep_sync_metadata(struct vnode *vp)
12915 {
12916         struct inode *ip;
12917         int error;
12918
12919         ip = VTOI(vp);
12920         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12921             ("softdep_sync_metadata called on non-softdep filesystem"));
12922         /*
12923          * Ensure that any direct block dependencies have been cleared,
12924          * truncations are started, and inode references are journaled.
12925          */
12926         ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
12927         /*
12928          * Write all journal records to prevent rollbacks on devvp.
               * This is only done when vp is a character device vnode.
12929          */
12930         if (vp->v_type == VCHR)
12931                 softdep_flushjournal(vp->v_mount);
12932         error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12933         /*
12934          * Ensure that all truncates are written so we won't find deps on
12935          * indirect blocks.
12936          */
12937         process_truncates(vp);
12938         FREE_LOCK(VFSTOUFS(vp->v_mount));
12939
12940         return (error);
12941 }
12942
12943 /*
12944  * This routine is called when we are attempting to sync a buf with
12945  * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12946  * other IO it can but returns EBUSY if the buffer is not yet able to
12947  * be written.  Dependencies which will not cause rollbacks will always
12948  * return 0.
       *
       * vp is the vnode the buffer is being synced for and must belong to a
       * soft updates mount (asserted); bp is the buffer whose b_dep list is
       * examined.  For VCHR vnodes nothing is flushed: EBUSY is returned
       * for MNT_NOWAIT when softdep_count_dependencies(bp, 0) is nonzero and
       * 0 otherwise.  For other vnodes the softdep lock is taken here and is
       * released again on every return path.
       *
       * Returns 0, EBUSY, an error from bwrite() of a bitmap buffer, or an
       * error from flush_pagedep_deps() (which may be ERELOOKUP).
12949  */
12950 int
12951 softdep_sync_buf(struct vnode *vp,
12952         struct buf *bp,
12953         int waitfor)
12954 {
12955         struct indirdep *indirdep;
12956         struct pagedep *pagedep;
12957         struct allocindir *aip;
12958         struct newblk *newblk;
12959         struct ufsmount *ump;
12960         struct buf *nbp;
12961         struct worklist *wk;
12962         int i, error;
12963
12964         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12965             ("softdep_sync_buf called on non-softdep filesystem"));
12966         /*
12967          * For VCHR we just don't want to force flush any dependencies that
12968          * will cause rollbacks.
12969          */
12970         if (vp->v_type == VCHR) {
12971                 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12972                         return (EBUSY);
12973                 return (0);
12974         }
12975         ump = VFSTOUFS(vp->v_mount);
12976         ACQUIRE_LOCK(ump);
12977         /*
12978          * As we hold the buffer locked, none of its dependencies
12979          * will disappear.
12980          */
12981         error = 0;
              /*
               * The scan of b_dep is restarted from "top" after a jwait() and
               * after a getdirtybuf() that returned no buffer.
               */
12982 top:
12983         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12984                 switch (wk->wk_type) {
12985                 case D_ALLOCDIRECT:
12986                 case D_ALLOCINDIR:
12987                         newblk = WK_NEWBLK(wk);
                              /*
                               * A jnewblk is still attached: give up with EBUSY
                               * in the MNT_NOWAIT pass, otherwise wait on it
                               * and rescan.
                               */
12988                         if (newblk->nb_jnewblk != NULL) {
12989                                 if (waitfor == MNT_NOWAIT) {
12990                                         error = EBUSY;
12991                                         goto out_unlock;
12992                                 }
12993                                 jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12994                                 goto top;
12995                         }
                              /*
                               * Nothing is forced when DEPCOMPLETE is already
                               * set or when this is the MNT_NOWAIT pass.
                               */
12996                         if (newblk->nb_state & DEPCOMPLETE ||
12997                             waitfor == MNT_NOWAIT)
12998                                 continue;
                              /*
                               * Write the bitmap buffer synchronously with the
                               * softdep lock dropped.  A bwrite() failure exits
                               * through "out" because the lock is not held.
                               */
12999                         nbp = newblk->nb_bmsafemap->sm_buf;
13000                         nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
13001                         if (nbp == NULL)
13002                                 goto top;
13003                         FREE_LOCK(ump);
13004                         if ((error = bwrite(nbp)) != 0)
13005                                 goto out;
13006                         ACQUIRE_LOCK(ump);
13007                         continue;
13008
13009                 case D_INDIRDEP:
13010                         indirdep = WK_INDIRDEP(wk);
                              /*
                               * In the MNT_NOWAIT pass any pending truncation
                               * or allocindir on the indirdep means EBUSY.
                               */
13011                         if (waitfor == MNT_NOWAIT) {
13012                                 if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
13013                                     !LIST_EMPTY(&indirdep->ir_deplisthd)) {
13014                                         error = EBUSY;
13015                                         goto out_unlock;
13016                                 }
13017                         }
13018                         if (!TAILQ_EMPTY(&indirdep->ir_trunc))
13019                                 panic("softdep_sync_buf: truncation pending.");
                              /*
                               * Handle one allocindir at a time, rescanning
                               * ir_deplisthd from its head after each wait or
                               * write.  The loop ends once every entry has no
                               * jnewblk and has DEPCOMPLETE set.
                               * NOTE(review): the cast assumes struct
                               * allocindir starts with a struct newblk --
                               * confirm against the structure definition.
                               */
13020                 restart:
13021                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13022                                 newblk = (struct newblk *)aip;
13023                                 if (newblk->nb_jnewblk != NULL) {
13024                                         jwait(&newblk->nb_jnewblk->jn_list,
13025                                             waitfor);
13026                                         goto restart;
13027                                 }
13028                                 if (newblk->nb_state & DEPCOMPLETE)
13029                                         continue;
13030                                 nbp = newblk->nb_bmsafemap->sm_buf;
13031                                 nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
13032                                 if (nbp == NULL)
13033                                         goto restart;
13034                                 FREE_LOCK(ump);
13035                                 if ((error = bwrite(nbp)) != 0)
13036                                         goto out;
13037                                 ACQUIRE_LOCK(ump);
13038                                 goto restart;
13039                         }
13040                         continue;
13041
13042                 case D_PAGEDEP:
13043                         /*
13044                          * Only flush directory entries in synchronous passes.
13045                          */
13046                         if (waitfor != MNT_WAIT) {
13047                                 error = EBUSY;
13048                                 goto out_unlock;
13049                         }
13050                         /*
13051                          * While syncing snapshots, we must allow recursive
13052                          * lookups.
13053                          */
13054                         BUF_AREC(bp);
13055                         /*
13056                          * We are trying to sync a directory that may
13057                          * have dependencies on both its own metadata
13058                          * and/or dependencies on the inodes of any
13059                          * recently allocated files. We walk its diradd
13060                          * lists pushing out the associated inode.
13061                          */
13062                         pagedep = WK_PAGEDEP(wk);
13063                         for (i = 0; i < DAHASHSZ; i++) {
13064                                 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
13065                                         continue;
13066                                 error = flush_pagedep_deps(vp, wk->wk_mp,
13067                                     &pagedep->pd_diraddhd[i], bp);
                                      /*
                                       * On failure BUF_NOREC() is applied
                                       * unless the error is ERELOOKUP.
                                       * NOTE(review): why ERELOOKUP skips it
                                       * is not visible here -- confirm
                                       * against flush_pagedep_deps().
                                       */
13068                                 if (error != 0) {
13069                                         if (error != ERELOOKUP)
13070                                                 BUF_NOREC(bp);
13071                                         goto out_unlock;
13072                                 }
13073                         }
13074                         BUF_NOREC(bp);
13075                         continue;
13076
                      /* These dependency types need no action here. */
13077                 case D_FREEWORK:
13078                 case D_FREEDEP:
13079                 case D_JSEGDEP:
13080                 case D_JNEWBLK:
13081                         continue;
13082
13083                 default:
13084                         panic("softdep_sync_buf: Unknown type %s",
13085                             TYPENAME(wk->wk_type));
13086                         /* NOTREACHED */
13087                 }
13088         }
      /* out_unlock: softdep lock held.  out: lock already released. */
13089 out_unlock:
13090         FREE_LOCK(ump);
13091 out:
13092         return (error);
13093 }
13094
13095 /*
13096  * Flush the dependencies associated with an inodedep.
       *
       * mp and ino identify the inodedep to look up; vp is not referenced in
       * the body.  The softdep lock must be held on entry (LOCK_OWNED) and is
       * held again on every return, although it is dropped and retaken while
       * the routine runs.
       *
       * Returns 0 when the inodedep does not exist or once both passes have
       * completed, otherwise the error stored by flush_deplist().
13097  */
13098 static int
13099 flush_inodedep_deps(
13100         struct vnode *vp,
13101         struct mount *mp,
13102         ino_t ino)
13103 {
13104         struct inodedep *inodedep;
13105         struct inoref *inoref;
13106         struct ufsmount *ump;
13107         int error, waitfor;
13108
13109         /*
13110          * This work is done in two passes. The first pass grabs most
13111          * of the buffers and begins asynchronously writing them. The
13112          * only way to wait for these asynchronous writes is to sleep
13113          * on the filesystem vnode which may stay busy for a long time
13114          * if the filesystem is active. So, instead, we make a second
13115          * pass over the dependencies blocking on each write. In the
13116          * usual case we will be blocking against a write that we
13117          * initiated, so when it is done the dependency will have been
13118          * resolved. Thus the second pass is expected to end quickly.
13119          * We give a brief window at the top of the loop to allow
13120          * any pending I/O to complete.
13121          */
13122         ump = VFSTOUFS(mp);
13123         LOCK_OWNED(ump);
13124         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
                      /* Stop as soon as a synchronous write has failed. */
13125                 if (error)
13126                         return (error);
                      /* The brief window mentioned above. */
13127                 FREE_LOCK(ump);
13128                 ACQUIRE_LOCK(ump);
13129 restart:
13130                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13131                         return (0);
                      /*
                       * Wait on any inoref that is DEPCOMPLETE but not
                       * GOINGAWAY, then repeat the lookup.
                       */
13132                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13133                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13134                             == DEPCOMPLETE) {
13135                                 jwait(&inoref->if_list, MNT_WAIT);
13136                                 goto restart;
13137                         }
13138                 }
                      /*
                       * A nonzero return from flush_deplist() means it waited
                       * or issued a write.  The remaining lists are skipped
                       * and the same pass is repeated from the top.
                       */
13139                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
13140                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
13141                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
13142                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
13143                         continue;
13144                 /*
13145                  * If pass2, we are done, otherwise do pass 2.
13146                  */
13147                 if (waitfor == MNT_WAIT)
13148                         break;
13149                 waitfor = MNT_WAIT;
13150         }
13151         /*
13152          * Try freeing inodedep in case all dependencies have been removed.
               * The result of free_inodedep() is deliberately ignored.
13153          */
13154         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
13155                 (void) free_inodedep(inodedep);
13156         return (0);
13157 }
13158
13159 /*
13160  * Flush an inode dependency list.
       *
       * listhead is a list of allocdirect entries.  waitfor selects between
       * an asynchronous bawrite() (MNT_NOWAIT) and a synchronous bwrite().
       * *errorp is written only with the result of a synchronous bwrite().
       * The softdep lock must be held on entry and is held on return.
       *
       * Handles at most one entry per call.  Returns 1 when it waited on a
       * journal record, issued a write, or failed to obtain the bitmap
       * buffer in a synchronous pass, so the caller should rescan.  Returns
       * 0 when the list is empty or no entry required any action.
13161  */
13162 static int
13163 flush_deplist(
13164         struct allocdirectlst *listhead,
13165         int waitfor,
13166         int *errorp)
13167 {
13168         struct allocdirect *adp;
13169         struct newblk *newblk;
13170         struct ufsmount *ump;
13171         struct buf *bp;
13172
              /* The mount is found through the first entry's worklist header. */
13173         if ((adp = TAILQ_FIRST(listhead)) == NULL)
13174                 return (0);
13175         ump = VFSTOUFS(adp->ad_list.wk_mp);
13176         LOCK_OWNED(ump);
13177         TAILQ_FOREACH(adp, listhead, ad_next) {
                      /*
                       * NOTE(review): the cast assumes struct allocdirect
                       * starts with a struct newblk -- confirm against the
                       * structure definition.
                       */
13178                 newblk = (struct newblk *)adp;
                      /* Journal record outstanding: wait, then ask for a rescan. */
13179                 if (newblk->nb_jnewblk != NULL) {
13180                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
13181                         return (1);
13182                 }
13183                 if (newblk->nb_state & DEPCOMPLETE)
13184                         continue;
13185                 bp = newblk->nb_bmsafemap->sm_buf;
13186                 bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
                      /*
                       * No buffer obtained: in the MNT_NOWAIT pass move on to
                       * the next entry, otherwise ask the caller to rescan.
                       */
13187                 if (bp == NULL) {
13188                         if (waitfor == MNT_NOWAIT)
13189                                 continue;
13190                         return (1);
13191                 }
                      /* Write the bitmap buffer with the softdep lock dropped. */
13192                 FREE_LOCK(ump);
13193                 if (waitfor == MNT_NOWAIT)
13194                         bawrite(bp);
13195                 else
13196                         *errorp = bwrite(bp);
13197                 ACQUIRE_LOCK(ump);
13198                 return (1);
13199         }
13200         return (0);
13201 }
13202
13203 /*
13204  * Flush dependencies associated with an allocdirect block.
       *
       * vp is the vnode owning the block and mp its mount.  lbn is used as an
       * index into the inode's i_db[] array and as the logical block number
       * for the buffer lookup.  Panics if that block pointer is zero or if
       * the newblk found for it is not a D_ALLOCDIRECT.
       *
       * The softdep lock is acquired here and is not held on return.
       *
       * Returns 0 once newblk_lookup() no longer finds the block, otherwise
       * the error from bwrite(), BUF_LOCK() or ffs_update().
13205  */
13206 static int
13207 flush_newblk_dep(
13208         struct vnode *vp,
13209         struct mount *mp,
13210         ufs_lbn_t lbn)
13211 {
13212         struct newblk *newblk;
13213         struct ufsmount *ump;
13214         struct bufobj *bo;
13215         struct inode *ip;
13216         struct buf *bp;
13217         ufs2_daddr_t blkno;
13218         int error;
13219
13220         error = 0;
13221         bo = &vp->v_bufobj;
13222         ip = VTOI(vp);
13223         blkno = DIP(ip, i_db[lbn]);
13224         if (blkno == 0)
13225                 panic("flush_newblk_dep: Missing block");
13226         ump = VFSTOUFS(mp);
13227         ACQUIRE_LOCK(ump);
13228         /*
13229          * Loop until all dependencies related to this block are satisfied.
13230          * We must be careful to restart after each sleep in case a write
13231          * completes some part of this process for us.
               *
               * The softdep lock is held at the top of every iteration and
               * has been released at every break out of the loop.
13232          */
13233         for (;;) {
                      /* The newblk is gone: all dependencies are resolved. */
13234                 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
13235                         FREE_LOCK(ump);
13236                         break;
13237                 }
13238                 if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
13239                         panic("flush_newblk_dep: Bad newblk %p", newblk);
13240                 /*
13241                  * Flush the journal.
13242                  */
13243                 if (newblk->nb_jnewblk != NULL) {
13244                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
13245                         continue;
13246                 }
13247                 /*
13248                  * Write the bitmap dependency.
13249                  */
13250                 if ((newblk->nb_state & DEPCOMPLETE) == 0) {
13251                         bp = newblk->nb_bmsafemap->sm_buf;
13252                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13253                         if (bp == NULL)
13254                                 continue;
13255                         FREE_LOCK(ump);
13256                         error = bwrite(bp);
13257                         if (error)
13258                                 break;
13259                         ACQUIRE_LOCK(ump);
13260                         continue;
13261                 }
13262                 /*
13263                  * Write the buffer.
                       *
                       * The bufobj lock is passed to BUF_LOCK() as its
                       * interlock (LK_INTERLOCK) and is only released
                       * explicitly here when no buffer is found.  A buffer
                       * that is not B_DELWRI is simply unlocked.
13264                  */
13265                 FREE_LOCK(ump);
13266                 BO_LOCK(bo);
13267                 bp = gbincore(bo, lbn);
13268                 if (bp != NULL) {
13269                         error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
13270                             LK_INTERLOCK, BO_LOCKPTR(bo));
13271                         if (error == ENOLCK) {
13272                                 ACQUIRE_LOCK(ump);
13273                                 error = 0;
13274                                 continue; /* Slept, retry */
13275                         }
13276                         if (error != 0)
13277                                 break;  /* Failed */
13278                         if (bp->b_flags & B_DELWRI) {
13279                                 bremfree(bp);
13280                                 error = bwrite(bp);
13281                                 if (error)
13282                                         break;
13283                         } else
13284                                 BUF_UNLOCK(bp);
13285                 } else
13286                         BO_UNLOCK(bo);
13287                 /*
13288                  * We have to wait for the direct pointers to
13289                  * point at the newdirblk before the dependency
13290                  * will go away.
13291                  */
13292                 error = ffs_update(vp, 1);
13293                 if (error)
13294                         break;
13295                 ACQUIRE_LOCK(ump);
13296         }
13297         return (error);
13298 }
13299
13300 /*
13301  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
13302  */
13303 static int
13304 flush_pagedep_deps(
13305         struct vnode *pvp,
13306         struct mount *mp,
13307         struct diraddhd *diraddhdp,
13308         struct buf *locked_bp)
13309 {
13310         struct inodedep *inodedep;
13311         struct inoref *inoref;
13312         struct ufsmount *ump;
13313         struct diradd *dap;
13314         struct vnode *vp;
13315         int error = 0;
13316         struct buf *bp;
13317         ino_t inum;
13318         struct diraddhd unfinished;
13319
13320         LIST_INIT(&unfinished);
13321         ump = VFSTOUFS(mp);
13322         LOCK_OWNED(ump);
13323 restart:
13324         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
13325                 /*
13326                  * Flush ourselves if this directory entry
13327                  * has a MKDIR_PARENT dependency.
13328                  */
13329                 if (dap->da_state & MKDIR_PARENT) {
13330                         FREE_LOCK(ump);
13331                         if ((error = ffs_update(pvp, 1)) != 0)
13332                                 break;
13333                         ACQUIRE_LOCK(ump);
13334                         /*
13335                          * If that cleared dependencies, go on to next.
13336                          */
13337                         if (dap != LIST_FIRST(diraddhdp))
13338                                 continue;
13339                         /*
13340                          * All MKDIR_PARENT dependencies and all the
13341                          * NEWBLOCK pagedeps that are contained in direct
13342                          * blocks were resolved by doing above ffs_update.
13343                          * Pagedeps contained in indirect blocks may
13344                          * require a complete sync'ing of the directory.
13345                          * We are in the midst of doing a complete sync,
13346                          * so if they are not resolved in this pass we
13347                          * defer them for now as they will be sync'ed by
13348                          * our caller shortly.
13349                          */
13350                         LIST_REMOVE(dap, da_pdlist);
13351                         LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
13352                         continue;
13353                 }
13354                 /*
13355                  * A newly allocated directory must have its "." and
13356                  * ".." entries written out before its name can be
13357                  * committed in its parent.
13358                  */
13359                 inum = dap->da_newinum;
13360                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13361                         panic("flush_pagedep_deps: lost inode1");
13362                 /*
13363                  * Wait for any pending journal adds to complete so we don't
13364                  * cause rollbacks while syncing.
13365                  */
13366                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13367                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13368                             == DEPCOMPLETE) {
13369                                 jwait(&inoref->if_list, MNT_WAIT);
13370                                 goto restart;
13371                         }
13372                 }
13373                 if (dap->da_state & MKDIR_BODY) {
13374                         FREE_LOCK(ump);
13375                         error = get_parent_vp(pvp, mp, inum, locked_bp,
13376                             diraddhdp, &unfinished, &vp);
13377                         if (error != 0)
13378                                 break;
13379                         error = flush_newblk_dep(vp, mp, 0);
13380                         /*
13381                          * If we still have the dependency we might need to
13382                          * update the vnode to sync the new link count to
13383                          * disk.
13384                          */
13385                         if (error == 0 && dap == LIST_FIRST(diraddhdp))
13386                                 error = ffs_update(vp, 1);
13387                         vput(vp);
13388                         if (error != 0)
13389                                 break;
13390                         ACQUIRE_LOCK(ump);
13391                         /*
13392                          * If that cleared dependencies, go on to next.
13393                          */
13394                         if (dap != LIST_FIRST(diraddhdp))
13395                                 continue;
13396                         if (dap->da_state & MKDIR_BODY) {
13397                                 inodedep_lookup(UFSTOVFS(ump), inum, 0,
13398                                     &inodedep);
13399                                 panic("flush_pagedep_deps: MKDIR_BODY "
13400                                     "inodedep %p dap %p vp %p",
13401                                     inodedep, dap, vp);
13402                         }
13403                 }
13404                 /*
13405                  * Flush the inode on which the directory entry depends.
13406                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
13407                  * the only remaining dependency is that the updated inode
13408                  * count must get pushed to disk. The inode has already
13409                  * been pushed into its inode buffer (via VOP_UPDATE) at
13410                  * the time of the reference count change. So we need only
13411                  * locate that buffer, ensure that there will be no rollback
13412                  * caused by a bitmap dependency, then write the inode buffer.
13413                  */
13414 retry:
13415                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13416                         panic("flush_pagedep_deps: lost inode");
13417                 /*
13418                  * If the inode still has bitmap dependencies,
13419                  * push them to disk.
13420                  */
13421                 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
13422                         bp = inodedep->id_bmsafemap->sm_buf;
13423                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13424                         if (bp == NULL)
13425                                 goto retry;
13426                         FREE_LOCK(ump);
13427                         if ((error = bwrite(bp)) != 0)
13428                                 break;
13429                         ACQUIRE_LOCK(ump);
13430                         if (dap != LIST_FIRST(diraddhdp))
13431                                 continue;
13432                 }
13433                 /*
13434                  * If the inode is still sitting in a buffer waiting
13435                  * to be written or waiting for the link count to be
13436                  * adjusted update it here to flush it to disk.
13437                  */
13438                 if (dap == LIST_FIRST(diraddhdp)) {
13439                         FREE_LOCK(ump);
13440                         error = get_parent_vp(pvp, mp, inum, locked_bp,
13441                             diraddhdp, &unfinished, &vp);
13442                         if (error != 0)
13443                                 break;
13444                         error = ffs_update(vp, 1);
13445                         vput(vp);
13446                         if (error)
13447                                 break;
13448                         ACQUIRE_LOCK(ump);
13449                 }
13450                 /*
13451                  * If we have failed to get rid of all the dependencies
13452                  * then something is seriously wrong.
13453                  */
13454                 if (dap == LIST_FIRST(diraddhdp)) {
13455                         inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13456                         panic("flush_pagedep_deps: failed to flush "
13457                             "inodedep %p ino %ju dap %p",
13458                             inodedep, (uintmax_t)inum, dap);
13459                 }
13460         }
13461         if (error)
13462                 ACQUIRE_LOCK(ump);
13463         while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13464                 LIST_REMOVE(dap, da_pdlist);
13465                 LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13466         }
13467         return (error);
13468 }
13469
13470 /*
13471  * A large burst of file addition or deletion activity can drive the
13472  * memory load excessively high. First attempt to slow things down
13473  * using the techniques below. If that fails, this routine requests
13474  * the offending operations to fall back to running synchronously
13475  * until the memory load returns to a reasonable level.
13476  */
13477 int
13478 softdep_slowdown(struct vnode *vp)
13479 {
13480         struct ufsmount *ump;
13481         int jlow;
13482         int max_softdeps_hard;
13483
13484         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13485             ("softdep_slowdown called on non-softdep filesystem"));
13486         ump = VFSTOUFS(vp->v_mount);
13487         ACQUIRE_LOCK(ump);
13488         jlow = 0;
13489         /*
13490          * Check for journal space if needed.
13491          */
13492         if (DOINGSUJ(vp)) {
13493                 if (journal_space(ump, 0) == 0)
13494                         jlow = 1;
13495         }
13496         /*
13497          * If the system is under its limits and our filesystem is
13498          * not responsible for more than our share of the usage and
13499          * we are not low on journal space, then no need to slow down.
13500          */
13501         max_softdeps_hard = max_softdeps * 11 / 10;
13502         if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13503             dep_current[D_INODEDEP] < max_softdeps_hard &&
13504             dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13505             dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13506             ump->softdep_curdeps[D_DIRREM] <
13507             (max_softdeps_hard / 2) / stat_flush_threads &&
13508             ump->softdep_curdeps[D_INODEDEP] <
13509             max_softdeps_hard / stat_flush_threads &&
13510             ump->softdep_curdeps[D_INDIRDEP] <
13511             (max_softdeps_hard / 1000) / stat_flush_threads &&
13512             ump->softdep_curdeps[D_FREEBLKS] <
13513             max_softdeps_hard / stat_flush_threads) {
13514                 FREE_LOCK(ump);
13515                 return (0);
13516         }
13517         /*
13518          * If the journal is low or our filesystem is over its limit
13519          * then speedup the cleanup.
13520          */
13521         if (ump->softdep_curdeps[D_INDIRDEP] <
13522             (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13523                 softdep_speedup(ump);
13524         stat_sync_limit_hit += 1;
13525         FREE_LOCK(ump);
13526         /*
13527          * We only slow down the rate at which new dependencies are
13528          * generated if we are not using journaling. With journaling,
13529          * the cleanup should always be sufficient to keep things
13530          * under control.
13531          */
13532         if (DOINGSUJ(vp))
13533                 return (0);
13534         return (1);
13535 }
13536
13537 static int
13538 softdep_request_cleanup_filter(struct vnode *vp, void *arg __unused)
13539 {
13540         return ((vp->v_iflag & VI_OWEINACT) != 0 && vp->v_usecount == 0 &&
13541             ((vp->v_vflag & VV_NOSYNC) != 0 || VTOI(vp)->i_effnlink == 0));
13542 }
13543
13544 static void
13545 softdep_request_cleanup_inactivate(struct mount *mp)
13546 {
13547         struct vnode *vp, *mvp;
13548         int error;
13549
13550         MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, softdep_request_cleanup_filter,
13551             NULL) {
13552                 vholdl(vp);
13553                 vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
13554                 VI_LOCK(vp);
13555                 if (IS_UFS(vp) && vp->v_usecount == 0) {
13556                         while ((vp->v_iflag & VI_OWEINACT) != 0) {
13557                                 error = vinactive(vp);
13558                                 if (error != 0 && error != ERELOOKUP)
13559                                         break;
13560                         }
13561                         atomic_add_int(&stat_delayed_inact, 1);
13562                 }
13563                 VOP_UNLOCK(vp);
13564                 vdropl(vp);
13565         }
13566 }
13567
13568 /*
13569  * Called by the allocation routines when they are about to fail
13570  * in the hope that we can free up the requested resource (inodes
13571  * or disk space).
13572  *
13573  * First check to see if the work list has anything on it. If it has,
13574  * clean up entries until we successfully free the requested resource.
13575  * Because this process holds inodes locked, we cannot handle any remove
13576  * requests that might block on a locked inode as that could lead to
13577  * deadlock. If the worklist yields none of the requested resource,
13578  * start syncing out vnodes to free up the needed space.
13579  */
13580 int
13581 softdep_request_cleanup(
13582         struct fs *fs,
13583         struct vnode *vp,
13584         struct ucred *cred,
13585         int resource)
13586 {
13587         struct ufsmount *ump;
13588         struct mount *mp;
13589         long starttime;
13590         ufs2_daddr_t needed;
13591         int error, failed_vnode;
13592
13593         /*
13594          * If we are being called because of a process doing a
13595          * copy-on-write, then it is not safe to process any
13596          * worklist items as we will recurse into the copyonwrite
13597          * routine.  This will result in an incoherent snapshot.
13598          * If the vnode that we hold is a snapshot, we must avoid
13599          * handling other resources that could cause deadlock.
13600          */
13601         if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13602                 return (0);
13603
13604         if (resource == FLUSH_BLOCKS_WAIT)
13605                 stat_cleanup_blkrequests += 1;
13606         else
13607                 stat_cleanup_inorequests += 1;
13608
13609         mp = vp->v_mount;
13610         ump = VFSTOUFS(mp);
13611         mtx_assert(UFS_MTX(ump), MA_OWNED);
13612         UFS_UNLOCK(ump);
13613         error = ffs_update(vp, 1);
13614         if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13615                 UFS_LOCK(ump);
13616                 return (0);
13617         }
13618         /*
13619          * If we are in need of resources, start by cleaning up
13620          * any block removals associated with our inode.
13621          */
13622         ACQUIRE_LOCK(ump);
13623         process_removes(vp);
13624         process_truncates(vp);
13625         FREE_LOCK(ump);
13626         /*
13627          * Now clean up at least as many resources as we will need.
13628          *
13629          * When requested to clean up inodes, the number that are needed
13630          * is set by the number of simultaneous writers (mnt_writeopcount)
13631          * plus a bit of slop (2) in case some more writers show up while
13632          * we are cleaning.
13633          *
13634          * When requested to free up space, the amount of space that
13635          * we need is enough blocks to allocate a full-sized segment
13636          * (fs_contigsumsize). The number of such segments that will
13637          * be needed is set by the number of simultaneous writers
13638          * (mnt_writeopcount) plus a bit of slop (2) in case some more
13639          * writers show up while we are cleaning.
13640          *
13641          * Additionally, if we are unpriviledged and allocating space,
13642          * we need to ensure that we clean up enough blocks to get the
13643          * needed number of blocks over the threshold of the minimum
13644          * number of blocks required to be kept free by the filesystem
13645          * (fs_minfree).
13646          */
13647         if (resource == FLUSH_INODES_WAIT) {
13648                 needed = vfs_mount_fetch_counter(vp->v_mount,
13649                     MNT_COUNT_WRITEOPCOUNT) + 2;
13650         } else if (resource == FLUSH_BLOCKS_WAIT) {
13651                 needed = (vfs_mount_fetch_counter(vp->v_mount,
13652                     MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize;
13653                 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE))
13654                         needed += fragstoblks(fs,
13655                             roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13656                             fs->fs_cstotal.cs_nffree, fs->fs_frag));
13657         } else {
13658                 printf("softdep_request_cleanup: Unknown resource type %d\n",
13659                     resource);
13660                 UFS_LOCK(ump);
13661                 return (0);
13662         }
13663         starttime = time_second;
13664 retry:
13665         if (resource == FLUSH_BLOCKS_WAIT &&
13666             fs->fs_cstotal.cs_nbfree <= needed)
13667                 softdep_send_speedup(ump, needed * fs->fs_bsize,
13668                     BIO_SPEEDUP_TRIM);
13669         if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13670             fs->fs_cstotal.cs_nbfree <= needed) ||
13671             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13672             fs->fs_cstotal.cs_nifree <= needed)) {
13673                 ACQUIRE_LOCK(ump);
13674                 if (ump->softdep_on_worklist > 0 &&
13675                     process_worklist_item(UFSTOVFS(ump),
13676                     ump->softdep_on_worklist, LK_NOWAIT) != 0)
13677                         stat_worklist_push += 1;
13678                 FREE_LOCK(ump);
13679         }
13680
13681         /*
13682          * Check that there are vnodes pending inactivation.  As they
13683          * have been unlinked, inactivating them will free up their
13684          * inodes.
13685          */
13686         ACQUIRE_LOCK(ump);
13687         if (resource == FLUSH_INODES_WAIT &&
13688             fs->fs_cstotal.cs_nifree <= needed &&
13689             fs->fs_pendinginodes <= needed) {
13690                 if ((ump->um_softdep->sd_flags & FLUSH_DI_ACTIVE) == 0) {
13691                         ump->um_softdep->sd_flags |= FLUSH_DI_ACTIVE;
13692                         FREE_LOCK(ump);
13693                         softdep_request_cleanup_inactivate(mp);
13694                         ACQUIRE_LOCK(ump);
13695                         ump->um_softdep->sd_flags &= ~FLUSH_DI_ACTIVE;
13696                         wakeup(&ump->um_softdep->sd_flags);
13697                 } else {
13698                         while ((ump->um_softdep->sd_flags &
13699                             FLUSH_DI_ACTIVE) != 0) {
13700                                 msleep(&ump->um_softdep->sd_flags,
13701                                     LOCK_PTR(ump), PVM, "ffsvina", hz);
13702                         }
13703                 }
13704         }
13705         FREE_LOCK(ump);
13706
13707         /*
13708          * If we still need resources and there are no more worklist
13709          * entries to process to obtain them, we have to start flushing
13710          * the dirty vnodes to force the release of additional requests
13711          * to the worklist that we can then process to reap addition
13712          * resources. We walk the vnodes associated with the mount point
13713          * until we get the needed worklist requests that we can reap.
13714          *
13715          * If there are several threads all needing to clean the same
13716          * mount point, only one is allowed to walk the mount list.
13717          * When several threads all try to walk the same mount list,
13718          * they end up competing with each other and often end up in
13719          * livelock. This approach ensures that forward progress is
13720          * made at the cost of occational ENOSPC errors being returned
13721          * that might otherwise have been avoided.
13722          */
13723         error = 1;
13724         if ((resource == FLUSH_BLOCKS_WAIT &&
13725              fs->fs_cstotal.cs_nbfree <= needed) ||
13726             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13727              fs->fs_cstotal.cs_nifree <= needed)) {
13728                 ACQUIRE_LOCK(ump);
13729                 if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
13730                         ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
13731                         FREE_LOCK(ump);
13732                         failed_vnode = softdep_request_cleanup_flush(mp, ump);
13733                         ACQUIRE_LOCK(ump);
13734                         ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
13735                         wakeup(&ump->um_softdep->sd_flags);
13736                         FREE_LOCK(ump);
13737                         if (ump->softdep_on_worklist > 0) {
13738                                 stat_cleanup_retries += 1;
13739                                 if (!failed_vnode)
13740                                         goto retry;
13741                         }
13742                 } else {
13743                         while ((ump->um_softdep->sd_flags &
13744                             FLUSH_RC_ACTIVE) != 0) {
13745                                 msleep(&ump->um_softdep->sd_flags,
13746                                     LOCK_PTR(ump), PVM, "ffsrca", hz);
13747                         }
13748                         FREE_LOCK(ump);
13749                         error = 0;
13750                 }
13751                 stat_cleanup_failures += 1;
13752         }
13753         if (time_second - starttime > stat_cleanup_high_delay)
13754                 stat_cleanup_high_delay = time_second - starttime;
13755         UFS_LOCK(ump);
13756         return (error);
13757 }
13758
13759 /*
13760  * Scan the vnodes for the specified mount point flushing out any
13761  * vnodes that can be locked without waiting. Finally, try to flush
13762  * the device associated with the mount point if it can be locked
13763  * without waiting.
13764  *
13765  * We return 0 if we were able to lock every vnode in our scan.
13766  * If we had to skip one or more vnodes, we return 1.
13767  */
13768 static int
13769 softdep_request_cleanup_flush(struct mount *mp, struct ufsmount *ump)
13770 {
13771         struct thread *td;
13772         struct vnode *lvp, *mvp;
13773         int failed_vnode;
13774
13775         failed_vnode = 0;
13776         td = curthread;
13777         MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13778                 if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13779                         VI_UNLOCK(lvp);
13780                         continue;
13781                 }
13782                 if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT) != 0) {
13783                         failed_vnode = 1;
13784                         continue;
13785                 }
13786                 if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
13787                         vput(lvp);
13788                         continue;
13789                 }
13790                 (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13791                 vput(lvp);
13792         }
13793         lvp = ump->um_devvp;
13794         if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13795                 VOP_FSYNC(lvp, MNT_NOWAIT, td);
13796                 VOP_UNLOCK(lvp);
13797         }
13798         return (failed_vnode);
13799 }
13800
13801 static bool
13802 softdep_excess_items(struct ufsmount *ump, int item)
13803 {
13804
13805         KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
13806         return (dep_current[item] > max_softdeps &&
13807             ump->softdep_curdeps[item] > max_softdeps /
13808             stat_flush_threads);
13809 }
13810
13811 static void
13812 schedule_cleanup(struct mount *mp)
13813 {
13814         struct ufsmount *ump;
13815         struct thread *td;
13816
13817         ump = VFSTOUFS(mp);
13818         LOCK_OWNED(ump);
13819         FREE_LOCK(ump);
13820         td = curthread;
13821         if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13822             (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13823                 /*
13824                  * No ast is delivered to kernel threads, so nobody
13825                  * would deref the mp.  Some kernel threads
13826                  * explicitly check for AST, e.g. NFS daemon does
13827                  * this in the serving loop.
13828                  */
13829                 return;
13830         }
13831         if (td->td_su != NULL)
13832                 vfs_rel(td->td_su);
13833         vfs_ref(mp);
13834         td->td_su = mp;
13835         thread_lock(td);
13836         td->td_flags |= TDF_ASTPENDING;
13837         thread_unlock(td);
13838 }
13839
13840 static void
13841 softdep_ast_cleanup_proc(struct thread *td)
13842 {
13843         struct mount *mp;
13844         struct ufsmount *ump;
13845         int error;
13846         bool req;
13847
13848         while ((mp = td->td_su) != NULL) {
13849                 td->td_su = NULL;
13850                 error = vfs_busy(mp, MBF_NOWAIT);
13851                 vfs_rel(mp);
13852                 if (error != 0)
13853                         return;
13854                 if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13855                         ump = VFSTOUFS(mp);
13856                         for (;;) {
13857                                 req = false;
13858                                 ACQUIRE_LOCK(ump);
13859                                 if (softdep_excess_items(ump, D_INODEDEP)) {
13860                                         req = true;
13861                                         request_cleanup(mp, FLUSH_INODES);
13862                                 }
13863                                 if (softdep_excess_items(ump, D_DIRREM)) {
13864                                         req = true;
13865                                         request_cleanup(mp, FLUSH_BLOCKS);
13866                                 }
13867                                 FREE_LOCK(ump);
13868                                 if (softdep_excess_items(ump, D_NEWBLK) ||
13869                                     softdep_excess_items(ump, D_ALLOCDIRECT) ||
13870                                     softdep_excess_items(ump, D_ALLOCINDIR)) {
13871                                         error = vn_start_write(NULL, &mp,
13872                                             V_WAIT);
13873                                         if (error == 0) {
13874                                                 req = true;
13875                                                 VFS_SYNC(mp, MNT_WAIT);
13876                                                 vn_finished_write(mp);
13877                                         }
13878                                 }
13879                                 if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13880                                         break;
13881                         }
13882                 }
13883                 vfs_unbusy(mp);
13884         }
13885         if ((mp = td->td_su) != NULL) {
13886                 td->td_su = NULL;
13887                 vfs_rel(mp);
13888         }
13889 }
13890
13891 /*
13892  * If memory utilization has gotten too high, deliberately slow things
13893  * down and speed up the I/O processing.
13894  */
13895 static int
13896 request_cleanup(struct mount *mp, int resource)
13897 {
13898         struct thread *td = curthread;
13899         struct ufsmount *ump;
13900
13901         ump = VFSTOUFS(mp);
13902         LOCK_OWNED(ump);
13903         /*
13904          * We never hold up the filesystem syncer or buf daemon.
13905          */
13906         if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13907                 return (0);
13908         /*
13909          * First check to see if the work list has gotten backlogged.
13910          * If it has, co-opt this process to help clean up two entries.
13911          * Because this process may hold inodes locked, we cannot
13912          * handle any remove requests that might block on a locked
13913          * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13914          * to avoid recursively processing the worklist.
13915          */
13916         if (ump->softdep_on_worklist > max_softdeps / 10) {
13917                 td->td_pflags |= TDP_SOFTDEP;
13918                 process_worklist_item(mp, 2, LK_NOWAIT);
13919                 td->td_pflags &= ~TDP_SOFTDEP;
13920                 stat_worklist_push += 2;
13921                 return(1);
13922         }
13923         /*
13924          * Next, we attempt to speed up the syncer process. If that
13925          * is successful, then we allow the process to continue.
13926          */
13927         if (softdep_speedup(ump) &&
13928             resource != FLUSH_BLOCKS_WAIT &&
13929             resource != FLUSH_INODES_WAIT)
13930                 return(0);
13931         /*
13932          * If we are resource constrained on inode dependencies, try
13933          * flushing some dirty inodes. Otherwise, we are constrained
13934          * by file deletions, so try accelerating flushes of directories
13935          * with removal dependencies. We would like to do the cleanup
13936          * here, but we probably hold an inode locked at this point and
13937          * that might deadlock against one that we try to clean. So,
13938          * the best that we can do is request the syncer daemon to do
13939          * the cleanup for us.
13940          */
13941         switch (resource) {
13942         case FLUSH_INODES:
13943         case FLUSH_INODES_WAIT:
13944                 ACQUIRE_GBLLOCK(&lk);
13945                 stat_ino_limit_push += 1;
13946                 req_clear_inodedeps += 1;
13947                 FREE_GBLLOCK(&lk);
13948                 stat_countp = &stat_ino_limit_hit;
13949                 break;
13950
13951         case FLUSH_BLOCKS:
13952         case FLUSH_BLOCKS_WAIT:
13953                 ACQUIRE_GBLLOCK(&lk);
13954                 stat_blk_limit_push += 1;
13955                 req_clear_remove += 1;
13956                 FREE_GBLLOCK(&lk);
13957                 stat_countp = &stat_blk_limit_hit;
13958                 break;
13959
13960         default:
13961                 panic("request_cleanup: unknown type");
13962         }
13963         /*
13964          * Hopefully the syncer daemon will catch up and awaken us.
13965          * We wait at most tickdelay before proceeding in any case.
13966          */
13967         ACQUIRE_GBLLOCK(&lk);
13968         FREE_LOCK(ump);
13969         proc_waiting += 1;
13970         if (callout_pending(&softdep_callout) == FALSE)
13971                 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13972                     pause_timer, 0);
13973
13974         if ((td->td_pflags & TDP_KTHREAD) == 0)
13975                 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13976         proc_waiting -= 1;
13977         FREE_GBLLOCK(&lk);
13978         ACQUIRE_LOCK(ump);
13979         return (1);
13980 }
13981
13982 /*
13983  * Awaken processes pausing in request_cleanup and clear proc_waiting
13984  * to indicate that there is no longer a timer running. Pause_timer
13985  * will be called with the global softdep mutex (&lk) locked.
13986  */
13987 static void
13988 pause_timer(void *arg)
13989 {
13990
13991         GBLLOCK_OWNED(&lk);
13992         /*
13993          * The callout_ API has acquired mtx and will hold it around this
13994          * function call.
13995          */
13996         *stat_countp += proc_waiting;
13997         wakeup(&proc_waiting);
13998 }
13999
14000 /*
14001  * If requested, try removing inode or removal dependencies.
14002  */
14003 static void
14004 check_clear_deps(struct mount *mp)
14005 {
14006         struct ufsmount *ump;
14007         bool suj_susp;
14008
14009         /*
14010          * Tell the lower layers that any TRIM or WRITE transactions that have
14011          * been delayed for performance reasons should proceed to help alleviate
14012          * the shortage faster. The race between checking req_* and the softdep
14013          * mutex (lk) is fine since this is an advisory operation that at most
14014          * causes deferred work to be done sooner.
14015          */
14016         ump = VFSTOUFS(mp);
14017         suj_susp = ump->um_softdep->sd_jblocks != NULL &&
14018             ump->softdep_jblocks->jb_suspended;
14019         if (req_clear_remove || req_clear_inodedeps || suj_susp) {
14020                 FREE_LOCK(ump);
14021                 softdep_send_speedup(ump, 0, BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE);
14022                 ACQUIRE_LOCK(ump);
14023         }
14024
14025         /*
14026          * If we are suspended, it may be because of our using
14027          * too many inodedeps, so help clear them out.
14028          */
14029         if (suj_susp)
14030                 clear_inodedeps(mp);
14031
14032         /*
14033          * General requests for cleanup of backed up dependencies
14034          */
14035         ACQUIRE_GBLLOCK(&lk);
14036         if (req_clear_inodedeps) {
14037                 req_clear_inodedeps -= 1;
14038                 FREE_GBLLOCK(&lk);
14039                 clear_inodedeps(mp);
14040                 ACQUIRE_GBLLOCK(&lk);
14041                 wakeup(&proc_waiting);
14042         }
14043         if (req_clear_remove) {
14044                 req_clear_remove -= 1;
14045                 FREE_GBLLOCK(&lk);
14046                 clear_remove(mp);
14047                 ACQUIRE_GBLLOCK(&lk);
14048                 wakeup(&proc_waiting);
14049         }
14050         FREE_GBLLOCK(&lk);
14051 }
14052
14053 /*
14054  * Flush out a directory with at least one removal dependency in an effort to
14055  * reduce the number of dirrem, freefile, and freeblks dependency structures.
14056  */
14057 static void
14058 clear_remove(struct mount *mp)
14059 {
14060         struct pagedep_hashhead *pagedephd;
14061         struct pagedep *pagedep;
14062         struct ufsmount *ump;
14063         struct vnode *vp;
14064         struct bufobj *bo;
14065         int error, cnt;
14066         ino_t ino;
14067
14068         ump = VFSTOUFS(mp);
14069         LOCK_OWNED(ump);
14070
14071         for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
14072                 pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
14073                 if (ump->pagedep_nextclean > ump->pagedep_hash_size)
14074                         ump->pagedep_nextclean = 0;
14075                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
14076                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
14077                                 continue;
14078                         ino = pagedep->pd_ino;
14079                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
14080                                 continue;
14081                         FREE_LOCK(ump);
14082
14083                         /*
14084                          * Let unmount clear deps
14085                          */
14086                         error = vfs_busy(mp, MBF_NOWAIT);
14087                         if (error != 0)
14088                                 goto finish_write;
14089                         error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
14090                              FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP);
14091                         vfs_unbusy(mp);
14092                         if (error != 0) {
14093                                 softdep_error("clear_remove: vget", error);
14094                                 goto finish_write;
14095                         }
14096                         MPASS(VTOI(vp)->i_mode != 0);
14097                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
14098                                 softdep_error("clear_remove: fsync", error);
14099                         bo = &vp->v_bufobj;
14100                         BO_LOCK(bo);
14101                         drain_output(vp);
14102                         BO_UNLOCK(bo);
14103                         vput(vp);
14104                 finish_write:
14105                         vn_finished_write(mp);
14106                         ACQUIRE_LOCK(ump);
14107                         return;
14108                 }
14109         }
14110 }
14111
14112 /*
14113  * Clear out a block of dirty inodes in an effort to reduce
14114  * the number of inodedep dependency structures.
14115  */
14116 static void
14117 clear_inodedeps(struct mount *mp)
14118 {
14119         struct inodedep_hashhead *inodedephd;
14120         struct inodedep *inodedep;
14121         struct ufsmount *ump;
14122         struct vnode *vp;
14123         struct fs *fs;
14124         int error, cnt;
14125         ino_t firstino, lastino, ino;
14126
14127         ump = VFSTOUFS(mp);
14128         fs = ump->um_fs;
14129         LOCK_OWNED(ump);
14130         /*
14131          * Pick a random inode dependency to be cleared.
14132          * We will then gather up all the inodes in its block
14133          * that have dependencies and flush them out.
14134          */
14135         for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
14136                 inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
14137                 if (ump->inodedep_nextclean > ump->inodedep_hash_size)
14138                         ump->inodedep_nextclean = 0;
14139                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
14140                         break;
14141         }
14142         if (inodedep == NULL)
14143                 return;
14144         /*
14145          * Find the last inode in the block with dependencies.
14146          */
14147         firstino = rounddown2(inodedep->id_ino, INOPB(fs));
14148         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
14149                 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
14150                         break;
14151         /*
14152          * Asynchronously push all but the last inode with dependencies.
14153          * Synchronously push the last inode with dependencies to ensure
14154          * that the inode block gets written to free up the inodedeps.
14155          */
14156         for (ino = firstino; ino <= lastino; ino++) {
14157                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
14158                         continue;
14159                 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
14160                         continue;
14161                 FREE_LOCK(ump);
14162                 error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
14163                 if (error != 0) {
14164                         vn_finished_write(mp);
14165                         ACQUIRE_LOCK(ump);
14166                         return;
14167                 }
14168                 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
14169                     FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP)) != 0) {
14170                         softdep_error("clear_inodedeps: vget", error);
14171                         vfs_unbusy(mp);
14172                         vn_finished_write(mp);
14173                         ACQUIRE_LOCK(ump);
14174                         return;
14175                 }
14176                 vfs_unbusy(mp);
14177                 if (VTOI(vp)->i_mode == 0) {
14178                         vgone(vp);
14179                 } else if (ino == lastino) {
14180                         do {
14181                                 error = ffs_syncvnode(vp, MNT_WAIT, 0);
14182                         } while (error == ERELOOKUP);
14183                         if (error != 0)
14184                                 softdep_error("clear_inodedeps: fsync1", error);
14185                 } else {
14186                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
14187                                 softdep_error("clear_inodedeps: fsync2", error);
14188                         BO_LOCK(&vp->v_bufobj);
14189                         drain_output(vp);
14190                         BO_UNLOCK(&vp->v_bufobj);
14191                 }
14192                 vput(vp);
14193                 vn_finished_write(mp);
14194                 ACQUIRE_LOCK(ump);
14195         }
14196 }
14197
14198 void
14199 softdep_buf_append(struct buf *bp, struct workhead *wkhd)
14200 {
14201         struct worklist *wk;
14202         struct ufsmount *ump;
14203
14204         if ((wk = LIST_FIRST(wkhd)) == NULL)
14205                 return;
14206         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
14207             ("softdep_buf_append called on non-softdep filesystem"));
14208         ump = VFSTOUFS(wk->wk_mp);
14209         ACQUIRE_LOCK(ump);
14210         while ((wk = LIST_FIRST(wkhd)) != NULL) {
14211                 WORKLIST_REMOVE(wk);
14212                 WORKLIST_INSERT(&bp->b_dep, wk);
14213         }
14214         FREE_LOCK(ump);
14215
14216 }
14217
14218 void
14219 softdep_inode_append(
14220         struct inode *ip,
14221         struct ucred *cred,
14222         struct workhead *wkhd)
14223 {
14224         struct buf *bp;
14225         struct fs *fs;
14226         struct ufsmount *ump;
14227         int error;
14228
14229         ump = ITOUMP(ip);
14230         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
14231             ("softdep_inode_append called on non-softdep filesystem"));
14232         fs = ump->um_fs;
14233         error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
14234             (int)fs->fs_bsize, cred, &bp);
14235         if (error) {
14236                 bqrelse(bp);
14237                 softdep_freework(wkhd);
14238                 return;
14239         }
14240         softdep_buf_append(bp, wkhd);
14241         bqrelse(bp);
14242 }
14243
14244 void
14245 softdep_freework(struct workhead *wkhd)
14246 {
14247         struct worklist *wk;
14248         struct ufsmount *ump;
14249
14250         if ((wk = LIST_FIRST(wkhd)) == NULL)
14251                 return;
14252         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
14253             ("softdep_freework called on non-softdep filesystem"));
14254         ump = VFSTOUFS(wk->wk_mp);
14255         ACQUIRE_LOCK(ump);
14256         handle_jwork(wkhd);
14257         FREE_LOCK(ump);
14258 }
14259
14260 static struct ufsmount *
14261 softdep_bp_to_mp(struct buf *bp)
14262 {
14263         struct mount *mp;
14264         struct vnode *vp;
14265
14266         if (LIST_EMPTY(&bp->b_dep))
14267                 return (NULL);
14268         vp = bp->b_vp;
14269         KASSERT(vp != NULL,
14270             ("%s, buffer with dependencies lacks vnode", __func__));
14271
14272         /*
14273          * The ump mount point is stable after we get a correct
14274          * pointer, since bp is locked and this prevents unmount from
14275          * proceeding.  But to get to it, we cannot dereference bp->b_dep
14276          * head wk_mp, because we do not yet own SU ump lock and
14277          * workitem might be freed while dereferenced.
14278          */
14279 retry:
14280         switch (vp->v_type) {
14281         case VCHR:
14282                 VI_LOCK(vp);
14283                 mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
14284                 VI_UNLOCK(vp);
14285                 if (mp == NULL)
14286                         goto retry;
14287                 break;
14288         case VREG:
14289         case VDIR:
14290         case VLNK:
14291         case VFIFO:
14292         case VSOCK:
14293                 mp = vp->v_mount;
14294                 break;
14295         case VBLK:
14296                 vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
14297                 /* FALLTHROUGH */
14298         case VNON:
14299         case VBAD:
14300         case VMARKER:
14301                 mp = NULL;
14302                 break;
14303         default:
14304                 vn_printf(vp, "unknown vnode type");
14305                 mp = NULL;
14306                 break;
14307         }
14308         return (VFSTOUFS(mp));
14309 }
14310
14311 /*
14312  * Function to determine if the buffer has outstanding dependencies
14313  * that will cause a roll-back if the buffer is written. If wantcount
14314  * is set, return number of dependencies, otherwise just yes or no.
14315  */
14316 static int
14317 softdep_count_dependencies(struct buf *bp, int wantcount)
14318 {
14319         struct worklist *wk;
14320         struct ufsmount *ump;
14321         struct bmsafemap *bmsafemap;
14322         struct freework *freework;
14323         struct inodedep *inodedep;
14324         struct indirdep *indirdep;
14325         struct freeblks *freeblks;
14326         struct allocindir *aip;
14327         struct pagedep *pagedep;
14328         struct dirrem *dirrem;
14329         struct newblk *newblk;
14330         struct mkdir *mkdir;
14331         struct diradd *dap;
14332         int i, retval;
14333
14334         ump = softdep_bp_to_mp(bp);
14335         if (ump == NULL)
14336                 return (0);
14337         retval = 0;
14338         ACQUIRE_LOCK(ump);
14339         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
14340                 switch (wk->wk_type) {
14341                 case D_INODEDEP:
14342                         inodedep = WK_INODEDEP(wk);
14343                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
14344                                 /* bitmap allocation dependency */
14345                                 retval += 1;
14346                                 if (!wantcount)
14347                                         goto out;
14348                         }
14349                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
14350                                 /* direct block pointer dependency */
14351                                 retval += 1;
14352                                 if (!wantcount)
14353                                         goto out;
14354                         }
14355                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
14356                                 /* direct block pointer dependency */
14357                                 retval += 1;
14358                                 if (!wantcount)
14359                                         goto out;
14360                         }
14361                         if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
14362                                 /* Add reference dependency. */
14363                                 retval += 1;
14364                                 if (!wantcount)
14365                                         goto out;
14366                         }
14367                         continue;
14368
14369                 case D_INDIRDEP:
14370                         indirdep = WK_INDIRDEP(wk);
14371
14372                         TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
14373                                 /* indirect truncation dependency */
14374                                 retval += 1;
14375                                 if (!wantcount)
14376                                         goto out;
14377                         }
14378
14379                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
14380                                 /* indirect block pointer dependency */
14381                                 retval += 1;
14382                                 if (!wantcount)
14383                                         goto out;
14384                         }
14385                         continue;
14386
14387                 case D_PAGEDEP:
14388                         pagedep = WK_PAGEDEP(wk);
14389                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
14390                                 if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
14391                                         /* Journal remove ref dependency. */
14392                                         retval += 1;
14393                                         if (!wantcount)
14394                                                 goto out;
14395                                 }
14396                         }
14397                         for (i = 0; i < DAHASHSZ; i++) {
14398                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
14399                                         /* directory entry dependency */
14400                                         retval += 1;
14401                                         if (!wantcount)
14402                                                 goto out;
14403                                 }
14404                         }
14405                         continue;
14406
14407                 case D_BMSAFEMAP:
14408                         bmsafemap = WK_BMSAFEMAP(wk);
14409                         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
14410                                 /* Add reference dependency. */
14411                                 retval += 1;
14412                                 if (!wantcount)
14413                                         goto out;
14414                         }
14415                         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
14416                                 /* Allocate block dependency. */
14417                                 retval += 1;
14418                                 if (!wantcount)
14419                                         goto out;
14420                         }
14421                         continue;
14422
14423                 case D_FREEBLKS:
14424                         freeblks = WK_FREEBLKS(wk);
14425                         if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
14426                                 /* Freeblk journal dependency. */
14427                                 retval += 1;
14428                                 if (!wantcount)
14429                                         goto out;
14430                         }
14431                         continue;
14432
14433                 case D_ALLOCDIRECT:
14434                 case D_ALLOCINDIR:
14435                         newblk = WK_NEWBLK(wk);
14436                         if (newblk->nb_jnewblk) {
14437                                 /* Journal allocate dependency. */
14438                                 retval += 1;
14439                                 if (!wantcount)
14440                                         goto out;
14441                         }
14442                         continue;
14443
14444                 case D_MKDIR:
14445                         mkdir = WK_MKDIR(wk);
14446                         if (mkdir->md_jaddref) {
14447                                 /* Journal reference dependency. */
14448                                 retval += 1;
14449                                 if (!wantcount)
14450                                         goto out;
14451                         }
14452                         continue;
14453
14454                 case D_FREEWORK:
14455                 case D_FREEDEP:
14456                 case D_JSEGDEP:
14457                 case D_JSEG:
14458                 case D_SBDEP:
14459                         /* never a dependency on these blocks */
14460                         continue;
14461
14462                 default:
14463                         panic("softdep_count_dependencies: Unexpected type %s",
14464                             TYPENAME(wk->wk_type));
14465                         /* NOTREACHED */
14466                 }
14467         }
14468 out:
14469         FREE_LOCK(ump);
14470         return (retval);
14471 }
14472
/*
 * Acquire exclusive access to a dirty buffer.
 *
 * Must be called with the rwlock passed in "lock" write-locked.
 * Returns bp, locked and taken off its queue with bremfree(), when the
 * buffer could be locked without contention, has no background write
 * in progress and is marked B_DELWRI.  Returns NULL in every other
 * case.
 *
 * When waitfor is MNT_WAIT the routine may sleep until the contended
 * buffer lock or the background write clears; it then still returns
 * NULL and the caller is expected to retry.  On those paths "lock" is
 * dropped for the duration of the sleep, so state it protects may have
 * changed by the time the function returns.
 */
static struct buf *
getdirtybuf(struct buf *bp,
        struct rwlock *lock,
        int waitfor)
{
        int error;

        /* Fast path: try to take the buffer lock without sleeping. */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
                if (waitfor != MNT_WAIT)
                        return (NULL);
                /* Sleep for the buffer lock, using "lock" as the interlock. */
                error = BUF_LOCK(bp,
                    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
                /*
                 * Even if we successfully acquire bp here, we have dropped
                 * lock, which may violate our guarantee.
                 */
                if (error == 0)
                        BUF_UNLOCK(bp);
                else if (error != ENOLCK)
                        panic("getdirtybuf: inconsistent lock: %d", error);
                /* Retake "lock" before reporting failure to the caller. */
                rw_wlock(lock);
                return (NULL);
        }
        /* A background write of this buffer is still in flight. */
        if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
                /*
                 * "lock" is not the buffer's bufobj lock: switch to the
                 * bufobj lock to sleep, then switch back.
                 */
                if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
                        rw_wunlock(lock);
                        BO_LOCK(bp->b_bufobj);
                        BUF_UNLOCK(bp);
                        /* Re-check now that the bufobj lock is held. */
                        if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
                                bp->b_vflags |= BV_BKGRDWAIT;
                                msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
                                       PRIBIO | PDROP, "getbuf", 0);
                        } else
                                BO_UNLOCK(bp->b_bufobj);
                        rw_wlock(lock);
                        return (NULL);
                }
                BUF_UNLOCK(bp);
                if (waitfor != MNT_WAIT)
                        return (NULL);
#ifdef DEBUG_VFS_LOCKS
                if (bp->b_vp->v_type != VCHR)
                        ASSERT_BO_WLOCKED(bp->b_bufobj);
#endif
                /* Ask to be woken when the background write finishes. */
                bp->b_vflags |= BV_BKGRDWAIT;
                rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
                return (NULL);
        }
        /* Only delayed-write (dirty) buffers are of interest. */
        if ((bp->b_flags & B_DELWRI) == 0) {
                BUF_UNLOCK(bp);
                return (NULL);
        }
        bremfree(bp);
        return (bp);
}
14533
/*
 * Check if it is safe to suspend the file system now.
 *
 * On entry the bufobj lock of devvp must be write-held (asserted
 * below); it is released before returning.  Return 0 with the mount
 * interlock held if the file system can be suspended now, otherwise
 * return EAGAIN with the mount interlock held.
 *
 * softdep_depcnt and softdep_accdepcnt are dependency counts sampled
 * earlier by the caller, and secondary_writes and secondary_accwrites
 * are the matching secondary-write counts; they are compared with the
 * current values to detect activity since they were taken.
 */
int
softdep_check_suspend(struct mount *mp,
                      struct vnode *devvp,
                      int softdep_depcnt,
                      int softdep_accdepcnt,
                      int secondary_writes,
                      int secondary_accwrites)
{
        struct buf *bp;
        struct bufobj *bo;
        struct ufsmount *ump;
        struct inodedep *inodedep;
        struct indirdep *indirdep;
        struct worklist *wk, *nextwk;
        int error, unlinked;

        bo = &devvp->v_bufobj;
        ASSERT_BO_WLOCKED(bo);

        /*
         * If we are not running with soft updates, then we need only
         * deal with secondary writes as we try to suspend.
         */
        if (MOUNTEDSOFTDEP(mp) == 0) {
                MNT_ILOCK(mp);
                /*
                 * msleep() with PDROP releases the mount interlock, so
                 * both locks are retaken after each wakeup.
                 */
                while (mp->mnt_secondary_writes != 0) {
                        BO_UNLOCK(bo);
                        msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
                            (PUSER - 1) | PDROP, "secwr", 0);
                        BO_LOCK(bo);
                        MNT_ILOCK(mp);
                }

                /*
                 * Reasons for needing more work before suspend:
                 * - Dirty buffers on devvp.
                 * - Secondary writes occurred after start of vnode sync loop
                 */
                error = 0;
                if (bo->bo_numoutput > 0 ||
                    bo->bo_dirty.bv_cnt > 0 ||
                    secondary_writes != 0 ||
                    mp->mnt_secondary_writes != 0 ||
                    secondary_accwrites != mp->mnt_secondary_accwrites)
                        error = EAGAIN;
                BO_UNLOCK(bo);
                return (error);
        }

        /*
         * If we are running with soft updates, then we need to coordinate
         * with them as we try to suspend.
         */
        ump = VFSTOUFS(mp);
        for (;;) {
                /*
                 * The softdep lock is only try-locked while the bufobj
                 * lock is held.  On failure, drop the bufobj lock, wait
                 * for the softdep lock to become available, and retry.
                 */
                if (!TRY_ACQUIRE_LOCK(ump)) {
                        BO_UNLOCK(bo);
                        ACQUIRE_LOCK(ump);
                        FREE_LOCK(ump);
                        BO_LOCK(bo);
                        continue;
                }
                MNT_ILOCK(mp);
                /* Wait out secondary writes with all other locks dropped. */
                if (mp->mnt_secondary_writes != 0) {
                        FREE_LOCK(ump);
                        BO_UNLOCK(bo);
                        msleep(&mp->mnt_secondary_writes,
                               MNT_MTX(mp),
                               (PUSER - 1) | PDROP, "secwr", 0);
                        BO_LOCK(bo);
                        continue;
                }
                break;
        }

        /*
         * With journaling, count the inodedeps on the unlinked list that
         * have all three unlink flags set and pass check_inodedep_free().
         * That count is the number of dependencies tolerated in the
         * final comparison below.
         */
        unlinked = 0;
        if (MOUNTEDSUJ(mp)) {
                for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
                    inodedep != NULL;
                    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
                        if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
                            UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
                            UNLINKONLIST) ||
                            !check_inodedep_free(inodedep))
                                continue;
                        unlinked++;
                }
        }

        /*
         * XXX Check for orphaned indirdep dependency structures.
         *
         * During forcible unmount after a disk failure there is a
         * bug that causes one or more indirdep dependency structures
         * to fail to be deallocated. We check for them here and clean
         * them up so that the unmount can succeed.
         */
        if ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0 && ump->softdep_deps > 0 &&
            ump->softdep_deps == ump->softdep_curdeps[D_INDIRDEP]) {
                LIST_FOREACH_SAFE(wk, &ump->softdep_alldeps[D_INDIRDEP],
                    wk_all, nextwk) {
                        indirdep = WK_INDIRDEP(wk);
                        /*
                         * Only free an indirdep that is GOINGAWAY and
                         * DEPCOMPLETE, has every work list empty, has no
                         * saved data and still owns its saved buffer.
                         */
                        if ((indirdep->ir_state & (GOINGAWAY | DEPCOMPLETE)) !=
                            (GOINGAWAY | DEPCOMPLETE) ||
                            !TAILQ_EMPTY(&indirdep->ir_trunc) ||
                            !LIST_EMPTY(&indirdep->ir_completehd) ||
                            !LIST_EMPTY(&indirdep->ir_writehd) ||
                            !LIST_EMPTY(&indirdep->ir_donehd) ||
                            !LIST_EMPTY(&indirdep->ir_deplisthd) ||
                            indirdep->ir_saveddata != NULL ||
                            indirdep->ir_savebp == NULL) {
                                printf("%s: skipping orphaned indirdep %p\n",
                                    __FUNCTION__, indirdep);
                                continue;
                        }
                        printf("%s: freeing orphaned indirdep %p\n",
                            __FUNCTION__, indirdep);
                        bp = indirdep->ir_savebp;
                        indirdep->ir_savebp = NULL;
                        free_indirdep(indirdep);
                        /* Release the saved buffer without the softdep lock. */
                        FREE_LOCK(ump);
                        brelse(bp);
                        /* Retake the softdep lock as in the loop above. */
                        while (!TRY_ACQUIRE_LOCK(ump)) {
                                BO_UNLOCK(bo);
                                ACQUIRE_LOCK(ump);
                                FREE_LOCK(ump);
                                BO_LOCK(bo);
                        }
                }
        }

        /*
         * Reasons for needing more work before suspend:
         * - Dirty buffers on devvp.
         * - Dependency structures still exist
         * - Softdep activity occurred after start of vnode sync loop
         * - Secondary writes occurred after start of vnode sync loop
         */
        error = 0;
        if (bo->bo_numoutput > 0 ||
            bo->bo_dirty.bv_cnt > 0 ||
            softdep_depcnt != unlinked ||
            ump->softdep_deps != unlinked ||
            softdep_accdepcnt != ump->softdep_accdeps ||
            secondary_writes != 0 ||
            mp->mnt_secondary_writes != 0 ||
            secondary_accwrites != mp->mnt_secondary_accwrites)
                error = EAGAIN;
        /* The mount interlock stays held for the caller. */
        FREE_LOCK(ump);
        BO_UNLOCK(bo);
        return (error);
}
14692
14693 /*
14694  * Get the number of dependency structures for the file system, both
14695  * the current number and the total number allocated.  These will
14696  * later be used to detect that softdep processing has occurred.
14697  */
14698 void
14699 softdep_get_depcounts(struct mount *mp,
14700                       int *softdep_depsp,
14701                       int *softdep_accdepsp)
14702 {
14703         struct ufsmount *ump;
14704
14705         if (MOUNTEDSOFTDEP(mp) == 0) {
14706                 *softdep_depsp = 0;
14707                 *softdep_accdepsp = 0;
14708                 return;
14709         }
14710         ump = VFSTOUFS(mp);
14711         ACQUIRE_LOCK(ump);
14712         *softdep_depsp = ump->softdep_deps;
14713         *softdep_accdepsp = ump->softdep_accdeps;
14714         FREE_LOCK(ump);
14715 }
14716
14717 /*
14718  * Wait for pending output on a vnode to complete.
14719  */
14720 static void
14721 drain_output(struct vnode *vp)
14722 {
14723
14724         ASSERT_VOP_LOCKED(vp, "drain_output");
14725         (void)bufobj_wwait(&vp->v_bufobj, 0, 0);
14726 }
14727
14728 /*
14729  * Called whenever a buffer that is being invalidated or reallocated
14730  * contains dependencies. This should only happen if an I/O error has
14731  * occurred. The routine is called with the buffer locked.
14732  */
14733 static void
14734 softdep_deallocate_dependencies(struct buf *bp)
14735 {
14736
14737         if ((bp->b_ioflags & BIO_ERROR) == 0)
14738                 panic("softdep_deallocate_dependencies: dangling deps");
14739         if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14740                 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14741         else
14742                 printf("softdep_deallocate_dependencies: "
14743                     "got error %d while accessing filesystem\n", bp->b_error);
14744         if (bp->b_error != ENXIO)
14745                 panic("softdep_deallocate_dependencies: unrecovered I/O error");
14746 }
14747
/*
 * Report an error hit while accessing the file system.  "func" is a
 * label for the message (callers pass an operation name or a mount
 * point name) and "error" is the error number.  The message is only
 * printed; no recovery is attempted.
 */
static void
softdep_error(char *func, int error)
{
        /* XXX should do something better! */
        printf("%s: got error %d while accessing filesystem\n", func, error);
}
14758
14759 #ifdef DDB
14760
14761 /* exported to ffs_vfsops.c */
14762 extern void db_print_ffs(struct ufsmount *ump);
14763 void
14764 db_print_ffs(struct ufsmount *ump)
14765 {
14766         db_printf("mp %p (%s) devvp %p\n", ump->um_mountp,
14767             ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp);
14768         db_printf("    fs %p ", ump->um_fs);
14769
14770         if (ump->um_softdep != NULL) {
14771                 db_printf("su_wl %d su_deps %d su_req %d\n",
14772                     ump->softdep_on_worklist, ump->softdep_deps,
14773                     ump->softdep_req);
14774         } else {
14775                 db_printf("su disabled\n");
14776         }
14777 }
14778
14779 static void
14780 worklist_print(struct worklist *wk, int verbose)
14781 {
14782
14783         if (!verbose) {
14784                 db_printf("%s: %p state 0x%b\n", TYPENAME(wk->wk_type), wk,
14785                     wk->wk_state, PRINT_SOFTDEP_FLAGS);
14786                 return;
14787         }
14788         db_printf("worklist: %p type %s state 0x%b next %p\n    ", wk,
14789             TYPENAME(wk->wk_type), wk->wk_state, PRINT_SOFTDEP_FLAGS,
14790             LIST_NEXT(wk, wk_list));
14791         db_print_ffs(VFSTOUFS(wk->wk_mp));
14792 }
14793
14794 static void
14795 inodedep_print(struct inodedep *inodedep, int verbose)
14796 {
14797
14798         worklist_print(&inodedep->id_list, 0);
14799         db_printf("    fs %p ino %jd inoblk %jd delta %jd nlink %jd\n",
14800             inodedep->id_fs,
14801             (intmax_t)inodedep->id_ino,
14802             (intmax_t)fsbtodb(inodedep->id_fs,
14803                 ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14804             (intmax_t)inodedep->id_nlinkdelta,
14805             (intmax_t)inodedep->id_savednlink);
14806
14807         if (verbose == 0)
14808                 return;
14809
14810         db_printf("    bmsafemap %p, mkdiradd %p, inoreflst %p\n",
14811             inodedep->id_bmsafemap,
14812             inodedep->id_mkdiradd,
14813             TAILQ_FIRST(&inodedep->id_inoreflst));
14814         db_printf("    dirremhd %p, pendinghd %p, bufwait %p\n",
14815             LIST_FIRST(&inodedep->id_dirremhd),
14816             LIST_FIRST(&inodedep->id_pendinghd),
14817             LIST_FIRST(&inodedep->id_bufwait));
14818         db_printf("    inowait %p, inoupdt %p, newinoupdt %p\n",
14819             LIST_FIRST(&inodedep->id_inowait),
14820             TAILQ_FIRST(&inodedep->id_inoupdt),
14821             TAILQ_FIRST(&inodedep->id_newinoupdt));
14822         db_printf("    extupdt %p, newextupdt %p, freeblklst %p\n",
14823             TAILQ_FIRST(&inodedep->id_extupdt),
14824             TAILQ_FIRST(&inodedep->id_newextupdt),
14825             TAILQ_FIRST(&inodedep->id_freeblklst));
14826         db_printf("    saveino %p, savedsize %jd, savedextsize %jd\n",
14827             inodedep->id_savedino1,
14828             (intmax_t)inodedep->id_savedsize,
14829             (intmax_t)inodedep->id_savedextsize);
14830 }
14831
14832 static void
14833 newblk_print(struct newblk *nbp)
14834 {
14835
14836         worklist_print(&nbp->nb_list, 0);
14837         db_printf("    newblkno %jd\n", (intmax_t)nbp->nb_newblkno);
14838         db_printf("    jnewblk %p, bmsafemap %p, freefrag %p\n",
14839             &nbp->nb_jnewblk,
14840             &nbp->nb_bmsafemap,
14841             &nbp->nb_freefrag);
14842         db_printf("    indirdeps %p, newdirblk %p, jwork %p\n",
14843             LIST_FIRST(&nbp->nb_indirdeps),
14844             LIST_FIRST(&nbp->nb_newdirblk),
14845             LIST_FIRST(&nbp->nb_jwork));
14846 }
14847
14848 static void
14849 allocdirect_print(struct allocdirect *adp)
14850 {
14851
14852         newblk_print(&adp->ad_block);
14853         db_printf("    oldblkno %jd, oldsize %ld, newsize %ld\n",
14854             adp->ad_oldblkno, adp->ad_oldsize, adp->ad_newsize);
14855         db_printf("    offset %d, inodedep %p\n",
14856             adp->ad_offset, adp->ad_inodedep);
14857 }
14858
14859 static void
14860 allocindir_print(struct allocindir *aip)
14861 {
14862
14863         newblk_print(&aip->ai_block);
14864         db_printf("    oldblkno %jd, lbn %jd\n",
14865             (intmax_t)aip->ai_oldblkno, (intmax_t)aip->ai_lbn);
14866         db_printf("    offset %d, indirdep %p\n",
14867             aip->ai_offset, aip->ai_indirdep);
14868 }
14869
14870 static void
14871 mkdir_print(struct mkdir *mkdir)
14872 {
14873
14874         worklist_print(&mkdir->md_list, 0);
14875         db_printf("    diradd %p, jaddref %p, buf %p\n",
14876                 mkdir->md_diradd, mkdir->md_jaddref, mkdir->md_buf);
14877 }
14878
14879 DB_SHOW_COMMAND(sd_inodedep, db_show_sd_inodedep)
14880 {
14881
14882         if (have_addr == 0) {
14883                 db_printf("inodedep address required\n");
14884                 return;
14885         }
14886         inodedep_print((struct inodedep*)addr, 1);
14887 }
14888
14889 DB_SHOW_COMMAND(sd_allinodedeps, db_show_sd_allinodedeps)
14890 {
14891         struct inodedep_hashhead *inodedephd;
14892         struct inodedep *inodedep;
14893         struct ufsmount *ump;
14894         int cnt;
14895
14896         if (have_addr == 0) {
14897                 db_printf("ufsmount address required\n");
14898                 return;
14899         }
14900         ump = (struct ufsmount *)addr;
14901         for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14902                 inodedephd = &ump->inodedep_hashtbl[cnt];
14903                 LIST_FOREACH(inodedep, inodedephd, id_hash) {
14904                         inodedep_print(inodedep, 0);
14905                 }
14906         }
14907 }
14908
14909 DB_SHOW_COMMAND(sd_worklist, db_show_sd_worklist)
14910 {
14911
14912         if (have_addr == 0) {
14913                 db_printf("worklist address required\n");
14914                 return;
14915         }
14916         worklist_print((struct worklist *)addr, 1);
14917 }
14918
14919 DB_SHOW_COMMAND(sd_workhead, db_show_sd_workhead)
14920 {
14921         struct worklist *wk;
14922         struct workhead *wkhd;
14923
14924         if (have_addr == 0) {
14925                 db_printf("worklist address required "
14926                     "(for example value in bp->b_dep)\n");
14927                 return;
14928         }
14929         /*
14930          * We often do not have the address of the worklist head but
14931          * instead a pointer to its first entry (e.g., we have the
14932          * contents of bp->b_dep rather than &bp->b_dep). But the back
14933          * pointer of bp->b_dep will point at the head of the list, so
14934          * we cheat and use that instead. If we are in the middle of
14935          * a list we will still get the same result, so nothing
14936          * unexpected will result.
14937          */
14938         wk = (struct worklist *)addr;
14939         if (wk == NULL)
14940                 return;
14941         wkhd = (struct workhead *)wk->wk_list.le_prev;
14942         LIST_FOREACH(wk, wkhd, wk_list) {
14943                 switch(wk->wk_type) {
14944                 case D_INODEDEP:
14945                         inodedep_print(WK_INODEDEP(wk), 0);
14946                         continue;
14947                 case D_ALLOCDIRECT:
14948                         allocdirect_print(WK_ALLOCDIRECT(wk));
14949                         continue;
14950                 case D_ALLOCINDIR:
14951                         allocindir_print(WK_ALLOCINDIR(wk));
14952                         continue;
14953                 case D_MKDIR:
14954                         mkdir_print(WK_MKDIR(wk));
14955                         continue;
14956                 default:
14957                         worklist_print(wk, 0);
14958                         continue;
14959                 }
14960         }
14961 }
14962
14963 DB_SHOW_COMMAND(sd_mkdir, db_show_sd_mkdir)
14964 {
14965         if (have_addr == 0) {
14966                 db_printf("mkdir address required\n");
14967                 return;
14968         }
14969         mkdir_print((struct mkdir *)addr);
14970 }
14971
14972 DB_SHOW_COMMAND(sd_mkdir_list, db_show_sd_mkdir_list)
14973 {
14974         struct mkdirlist *mkdirlisthd;
14975         struct mkdir *mkdir;
14976
14977         if (have_addr == 0) {
14978                 db_printf("mkdir listhead address required\n");
14979                 return;
14980         }
14981         mkdirlisthd = (struct mkdirlist *)addr;
14982         LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14983                 mkdir_print(mkdir);
14984                 if (mkdir->md_diradd != NULL) {
14985                         db_printf("    ");
14986                         worklist_print(&mkdir->md_diradd->da_list, 0);
14987                 }
14988                 if (mkdir->md_jaddref != NULL) {
14989                         db_printf("    ");
14990                         worklist_print(&mkdir->md_jaddref->ja_list, 0);
14991                 }
14992         }
14993 }
14994
14995 DB_SHOW_COMMAND(sd_allocdirect, db_show_sd_allocdirect)
14996 {
14997         if (have_addr == 0) {
14998                 db_printf("allocdirect address required\n");
14999                 return;
15000         }
15001         allocdirect_print((struct allocdirect *)addr);
15002 }
15003
15004 DB_SHOW_COMMAND(sd_allocindir, db_show_sd_allocindir)
15005 {
15006         if (have_addr == 0) {
15007                 db_printf("allocindir address required\n");
15008                 return;
15009         }
15010         allocindir_print((struct allocindir *)addr);
15011 }
15012
15013 #endif /* DDB */
15014
15015 #endif /* SOFTUPDATES */