sys/ufs/ffs/ffs_softdep.c

   1 /*-
   2  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
   3  *
   4  * The soft updates code is derived from the appendix of a University
   5  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
   6  * "Soft Updates: A Solution to the Metadata Update Problem in File
   7  * Systems", CSE-TR-254-95, August 1995).
   8  *
   9  * Further information about soft updates can be obtained from:
  10  *
  11  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
  12  *      1614 Oxford Street              mckusick@mckusick.com
  13  *      Berkeley, CA 94709-1608         +1-510-843-9542
  14  *      USA
  15  *
  16  * Redistribution and use in source and binary forms, with or without
  17  * modification, are permitted provided that the following conditions
  18  * are met:
  19  *
  20  * 1. Redistributions of source code must retain the above copyright
  21  *    notice, this list of conditions and the following disclaimer.
  22  * 2. Redistributions in binary form must reproduce the above copyright
  23  *    notice, this list of conditions and the following disclaimer in the
  24  *    documentation and/or other materials provided with the distribution.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  27  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  28  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  29  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  30  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
  39  */
  40
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * For now we want the safety net that the DEBUG flag provides.
  46  */
  47 #ifndef DEBUG
  48 #define DEBUG
  49 #endif
  50
  51 #include <sys/param.h>
  52 #include <sys/kernel.h>
  53 #include <sys/systm.h>
  54 #include <sys/bio.h>
  55 #include <sys/buf.h>
  56 #include <sys/kdb.h>
  57 #include <sys/kthread.h>
  58 #include <sys/lock.h>
  59 #include <sys/malloc.h>
  60 #include <sys/mount.h>
  61 #include <sys/mutex.h>
  62 #include <sys/proc.h>
  63 #include <sys/stat.h>
  64 #include <sys/sysctl.h>
  65 #include <sys/syslog.h>
  66 #include <sys/vnode.h>
  67 #include <sys/conf.h>
  68 #include <ufs/ufs/dir.h>
  69 #include <ufs/ufs/extattr.h>
  70 #include <ufs/ufs/quota.h>
  71 #include <ufs/ufs/inode.h>
  72 #include <ufs/ufs/ufsmount.h>
  73 #include <ufs/ffs/fs.h>
  74 #include <ufs/ffs/softdep.h>
  75 #include <ufs/ffs/ffs_extern.h>
  76 #include <ufs/ufs/ufs_extern.h>
  77
  78 #include <vm/vm.h>
  79
  80 #include "opt_ffs.h"
  81 #include "opt_quota.h"
  82
  83 #ifndef SOFTUPDATES
  84
  85 int
  86 softdep_flushfiles(oldmnt, flags, td)
  87         struct mount *oldmnt;
  88         int flags;
  89         struct thread *td;
  90 {
  91
  92         panic("softdep_flushfiles called");
  93 }
  94
  95 int
  96 softdep_mount(devvp, mp, fs, cred)
  97         struct vnode *devvp;
  98         struct mount *mp;
  99         struct fs *fs;
 100         struct ucred *cred;
 101 {
 102
 103         return (0);
 104 }
 105
 106 void
 107 softdep_initialize()
 108 {
 109
 110         return;
 111 }
 112
 113 void
 114 softdep_uninitialize()
 115 {
 116
 117         return;
 118 }
 119
 120 void
 121 softdep_setup_inomapdep(bp, ip, newinum)
 122         struct buf *bp;
 123         struct inode *ip;
 124         ino_t newinum;
 125 {
 126
 127         panic("softdep_setup_inomapdep called");
 128 }
 129
 130 void
 131 softdep_setup_blkmapdep(bp, mp, newblkno)
 132         struct buf *bp;
 133         struct mount *mp;
 134         ufs2_daddr_t newblkno;
 135 {
 136
 137         panic("softdep_setup_blkmapdep called");
 138 }
 139
 140 void
 141 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 142         struct inode *ip;
 143         ufs_lbn_t lbn;
 144         ufs2_daddr_t newblkno;
 145         ufs2_daddr_t oldblkno;
 146         long newsize;
 147         long oldsize;
 148         struct buf *bp;
 149 {
 150
 151         panic("softdep_setup_allocdirect called");
 152 }
 153
 154 void
 155 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 156         struct inode *ip;
 157         ufs_lbn_t lbn;
 158         ufs2_daddr_t newblkno;
 159         ufs2_daddr_t oldblkno;
 160         long newsize;
 161         long oldsize;
 162         struct buf *bp;
 163 {
 164
 165         panic("softdep_setup_allocext called");
 166 }
 167
 168 void
 169 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 170         struct inode *ip;
 171         ufs_lbn_t lbn;
 172         struct buf *bp;
 173         int ptrno;
 174         ufs2_daddr_t newblkno;
 175         ufs2_daddr_t oldblkno;
 176         struct buf *nbp;
 177 {
 178
 179         panic("softdep_setup_allocindir_page called");
 180 }
 181
 182 void
 183 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 184         struct buf *nbp;
 185         struct inode *ip;
 186         struct buf *bp;
 187         int ptrno;
 188         ufs2_daddr_t newblkno;
 189 {
 190
 191         panic("softdep_setup_allocindir_meta called");
 192 }
 193
 194 void
 195 softdep_setup_freeblocks(ip, length, flags)
 196         struct inode *ip;
 197         off_t length;
 198         int flags;
 199 {
 200
 201         panic("softdep_setup_freeblocks called");
 202 }
 203
 204 void
 205 softdep_freefile(pvp, ino, mode)
 206                 struct vnode *pvp;
 207                 ino_t ino;
 208                 int mode;
 209 {
 210
 211         panic("softdep_freefile called");
 212 }
 213
 214 int
 215 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 216         struct buf *bp;
 217         struct inode *dp;
 218         off_t diroffset;
 219         ino_t newinum;
 220         struct buf *newdirbp;
 221         int isnewblk;
 222 {
 223
 224         panic("softdep_setup_directory_add called");
 225 }
 226
 227 void
 228 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
 229         struct inode *dp;
 230         caddr_t base;
 231         caddr_t oldloc;
 232         caddr_t newloc;
 233         int entrysize;
 234 {
 235
 236         panic("softdep_change_directoryentry_offset called");
 237 }
 238
 239 void
 240 softdep_setup_remove(bp, dp, ip, isrmdir)
 241         struct buf *bp;
 242         struct inode *dp;
 243         struct inode *ip;
 244         int isrmdir;
 245 {
 246
 247         panic("softdep_setup_remove called");
 248 }
 249
 250 void
 251 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 252         struct buf *bp;
 253         struct inode *dp;
 254         struct inode *ip;
 255         ino_t newinum;
 256         int isrmdir;
 257 {
 258
 259         panic("softdep_setup_directory_change called");
 260 }
 261
 262 void
 263 softdep_change_linkcnt(ip)
 264         struct inode *ip;
 265 {
 266
 267         panic("softdep_change_linkcnt called");
 268 }
 269
 270 void
 271 softdep_load_inodeblock(ip)
 272         struct inode *ip;
 273 {
 274
 275         panic("softdep_load_inodeblock called");
 276 }
 277
 278 void
 279 softdep_update_inodeblock(ip, bp, waitfor)
 280         struct inode *ip;
 281         struct buf *bp;
 282         int waitfor;
 283 {
 284
 285         panic("softdep_update_inodeblock called");
 286 }
 287
 288 int
 289 softdep_fsync(vp)
 290         struct vnode *vp;       /* the "in_core" copy of the inode */
 291 {
 292
 293         return (0);
 294 }
 295
 296 void
 297 softdep_fsync_mountdev(vp)
 298         struct vnode *vp;
 299 {
 300
 301         return;
 302 }
 303
 304 int
 305 softdep_flushworklist(oldmnt, countp, td)
 306         struct mount *oldmnt;
 307         int *countp;
 308         struct thread *td;
 309 {
 310
 311         *countp = 0;
 312         return (0);
 313 }
 314
 315 int
 316 softdep_sync_metadata(struct vnode *vp)
 317 {
 318
 319         return (0);
 320 }
 321
 322 int
 323 softdep_slowdown(vp)
 324         struct vnode *vp;
 325 {
 326
 327         panic("softdep_slowdown called");
 328 }
 329
 330 void
 331 softdep_releasefile(ip)
 332         struct inode *ip;       /* inode with the zero effective link count */
 333 {
 334
 335         panic("softdep_releasefile called");
 336 }
 337
 338 int
 339 softdep_request_cleanup(fs, vp)
 340         struct fs *fs;
 341         struct vnode *vp;
 342 {
 343
 344         return (0);
 345 }
 346
 347 int
 348 softdep_check_suspend(struct mount *mp,
 349                       struct vnode *devvp,
 350                       int softdep_deps,
 351                       int softdep_accdeps,
 352                       int secondary_writes,
 353                       int secondary_accwrites)
 354 {
 355         struct bufobj *bo;
 356         int error;
 357
 358         (void) softdep_deps,
 359         (void) softdep_accdeps;
 360
 361         ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
 362         bo = &devvp->v_bufobj;
 363
 364         for (;;) {
 365                 if (!MNT_ITRYLOCK(mp)) {
 366                         VI_UNLOCK(devvp);
 367                         MNT_ILOCK(mp);
 368                         MNT_IUNLOCK(mp);
 369                         VI_LOCK(devvp);
 370                         continue;
 371                 }
 372                 if (mp->mnt_secondary_writes != 0) {
 373                         VI_UNLOCK(devvp);
 374                         msleep(&mp->mnt_secondary_writes,
 375                                MNT_MTX(mp),
 376                                (PUSER - 1) | PDROP, "secwr", 0);
 377                         VI_LOCK(devvp);
 378                         continue;
 379                 }
 380                 break;
 381         }
 382
 383         /*
 384          * Reasons for needing more work before suspend:
 385          * - Dirty buffers on devvp.
 386          * - Secondary writes occurred after start of vnode sync loop
 387          */
 388         error = 0;
 389         if (bo->bo_numoutput > 0 ||
 390             bo->bo_dirty.bv_cnt > 0 ||
 391             secondary_writes != 0 ||
 392             mp->mnt_secondary_writes != 0 ||
 393             secondary_accwrites != mp->mnt_secondary_accwrites)
 394                 error = EAGAIN;
 395         VI_UNLOCK(devvp);
 396         return (error);
 397 }
 398
 399 void
 400 softdep_get_depcounts(struct mount *mp,
 401                       int *softdepactivep,
 402                       int *softdepactiveaccp)
 403 {
 404         (void) mp;
 405         *softdepactivep = 0;
 406         *softdepactiveaccp = 0;
 407 }
 408
 409 #else
 410 /*
 411  * These definitions need to be adapted to the system to which
 412  * this file is being ported.
 413  */
 414 /*
 415  * malloc types defined for the softdep system.
 416  */
/*
 * Each line defines one malloc type: identifier, short name, description.
 * The short name is what TYPENAME() reports (via ks_shortdesc).
 */
static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
/* M_SAVEDINO has no D_* index and no slot in the memtype[] table. */
static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");

/*
 * malloc flags combined for softdep allocations.
 * NOTE(review): the allocation sites are outside this view - confirm usage.
 */
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
 434
/*
 * Workitem type codes. They double as indices into the memtype[] table,
 * so the numbering must stay dense and in the same order as that table.
 */
#define D_PAGEDEP       0
#define D_INODEDEP      1
#define D_NEWBLK        2
#define D_BMSAFEMAP     3
#define D_ALLOCDIRECT   4
#define D_INDIRDEP      5
#define D_ALLOCINDIR    6
#define D_FREEFRAG      7
#define D_FREEBLKS      8
#define D_FREEFILE      9
#define D_DIRADD        10
#define D_MKDIR         11
#define D_DIRREM        12
#define D_NEWDIRBLK     13
/* Highest valid type code (inclusive), not a count. */
#define D_LAST          D_NEWDIRBLK
 450
 451 /*
 452  * translate from workitem type to memory type
 453  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 454  */
/* Indexed by D_* workitem type; holds D_LAST + 1 entries. */
static struct malloc_type *memtype[] = {
        M_PAGEDEP,
        M_INODEDEP,
        M_NEWBLK,
        M_BMSAFEMAP,
        M_ALLOCDIRECT,
        M_INDIRDEP,
        M_ALLOCINDIR,
        M_FREEFRAG,
        M_FREEBLKS,
        M_FREEFILE,
        M_DIRADD,
        M_MKDIR,
        M_DIRREM,
        M_NEWDIRBLK
};

/* Map a D_* type code to its malloc type; performs no bounds check. */
#define DtoM(type) (memtype[type])
 473
 474 /*
 475  * Names of malloc types.
 476  */
 477 #define TYPENAME(type)  \
 478         ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
 479 /*
 480  * End system adaptation definitions.
 481  */
 482
 483 /*
 484  * Forward declarations.
 485  */
/* Hash-chain head types; only pointers to them appear in the prototypes. */
struct inodedep_hashhead;
struct newblk_hashhead;
struct pagedep_hashhead;

/*
 * Internal function prototypes.
 */
static  void softdep_error(char *, int);
static  void drain_output(struct vnode *);
static  struct buf *getdirtybuf(struct buf *, struct mtx *, int);
static  void clear_remove(struct thread *);
static  void clear_inodedeps(struct thread *);
static  int flush_pagedep_deps(struct vnode *, struct mount *,
            struct diraddhd *);
static  int flush_inodedep_deps(struct mount *, ino_t);
static  int flush_deplist(struct allocdirectlst *, int, int *);
static  int handle_written_filepage(struct pagedep *, struct buf *);
static  void diradd_inode_written(struct diradd *, struct inodedep *);
static  int handle_written_inodeblock(struct inodedep *, struct buf *);
static  void handle_allocdirect_partdone(struct allocdirect *);
static  void handle_allocindir_partdone(struct allocindir *);
static  void initiate_write_filepage(struct pagedep *, struct buf *);
static  void handle_written_mkdir(struct mkdir *, int);
static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static  void handle_workitem_freefile(struct freefile *);
static  void handle_workitem_remove(struct dirrem *, struct vnode *);
static  struct dirrem *newdirrem(struct buf *, struct inode *,
            struct inode *, int, struct dirrem **);
static  void free_diradd(struct diradd *);
static  void free_allocindir(struct allocindir *, struct inodedep *);
static  void free_newdirblk(struct newdirblk *);
static  int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
            ufs2_daddr_t *);
static  void deallocate_dependencies(struct buf *, struct inodedep *);
static  void free_allocdirect(struct allocdirectlst *,
            struct allocdirect *, int);
static  int check_inode_unwritten(struct inodedep *);
static  int free_inodedep(struct inodedep *);
static  void handle_workitem_freeblocks(struct freeblks *, int);
static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static  void setup_allocindir_phase2(struct buf *, struct inode *,
            struct allocindir *);
static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
            ufs2_daddr_t);
static  void handle_workitem_freefrag(struct freefrag *);
static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
static  void allocdirect_merge(struct allocdirectlst *,
            struct allocdirect *, struct allocdirect *);
static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
static  int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
            struct newblk **);
static  int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
static  int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
            struct inodedep **);
static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
static  int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
            struct mount *mp, int, struct pagedep **);
static  void pause_timer(void *);
static  int request_cleanup(struct mount *, int);
static  int process_worklist_item(struct mount *, int);
static  void add_to_worklist(struct worklist *);
/* Body of the "softdepflush" kernel process. */
static  void softdep_flush(void);
/* Wakes the flush process; the softdep lock must be held by the caller. */
static  int softdep_speedup(void);

/*
 * Exported softdep operations.
 */
static  void softdep_disk_io_initiation(struct buf *);
static  void softdep_disk_write_complete(struct buf *);
static  void softdep_deallocate_dependencies(struct buf *);
static  int softdep_count_dependencies(struct buf *bp, int);
 559
/*
 * The softdep mutex. The worklist and workitem routines in this file
 * assert that it is held (mtx_assert(&lk, MA_OWNED)).
 */
static struct mtx lk;
MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);

/* Thin wrappers over the mutex primitives; the argument is a struct mtx *. */
#define TRY_ACQUIRE_LOCK(lk)            mtx_trylock(lk)
#define ACQUIRE_LOCK(lk)                mtx_lock(lk)
#define FREE_LOCK(lk)                   mtx_unlock(lk)
 566
 567 /*
 568  * Worklist queue management.
 569  * These routines require that the lock be held.
 570  */
/*
 * Without DEBUG the list operations are open-coded macros; with DEBUG
 * they go through functions that also check the ONWORKLIST flag and
 * assert that the lock is held. This file defines DEBUG near the top,
 * so the checking variants are the ones normally built.
 */
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do {        \
        (item)->wk_state |= ONWORKLIST;         \
        LIST_INSERT_HEAD(head, item, wk_list);  \
} while (0)
#define WORKLIST_REMOVE(item) do {              \
        (item)->wk_state &= ~ONWORKLIST;        \
        LIST_REMOVE(item, wk_list);             \
} while (0)
#else /* DEBUG */
static  void worklist_insert(struct workhead *, struct worklist *);
static  void worklist_remove(struct worklist *);

#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
 586
 587 static void
 588 worklist_insert(head, item)
 589         struct workhead *head;
 590         struct worklist *item;
 591 {
 592
 593         mtx_assert(&lk, MA_OWNED);
 594         if (item->wk_state & ONWORKLIST)
 595                 panic("worklist_insert: already on list");
 596         item->wk_state |= ONWORKLIST;
 597         LIST_INSERT_HEAD(head, item, wk_list);
 598 }
 599
 600 static void
 601 worklist_remove(item)
 602         struct worklist *item;
 603 {
 604
 605         mtx_assert(&lk, MA_OWNED);
 606         if ((item->wk_state & ONWORKLIST) == 0)
 607                 panic("worklist_remove: not on list");
 608         item->wk_state &= ~ONWORKLIST;
 609         LIST_REMOVE(item, wk_list);
 610 }
 611 #endif /* DEBUG */
 612
 613 /*
 614  * Routines for tracking and managing workitems.
 615  */
static  void workitem_free(struct worklist *, int);
static  void workitem_alloc(struct worklist *, int, struct mount *);

/*
 * Convenience wrapper: casts any dependency structure pointer to its
 * struct worklist header before handing it to workitem_free().
 * NOTE(review): this assumes the worklist header is the first member of
 * every dependency structure - confirm against softdep.h.
 */
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
 620
 621 static void
 622 workitem_free(item, type)
 623         struct worklist *item;
 624         int type;
 625 {
 626         struct ufsmount *ump;
 627         mtx_assert(&lk, MA_OWNED);
 628
 629 #ifdef DEBUG
 630         if (item->wk_state & ONWORKLIST)
 631                 panic("workitem_free: still on list");
 632         if (item->wk_type != type)
 633                 panic("workitem_free: type mismatch");
 634 #endif
 635         ump = VFSTOUFS(item->wk_mp);
 636         if (--ump->softdep_deps == 0 && ump->softdep_req)
 637                 wakeup(&ump->softdep_deps);
 638         FREE(item, DtoM(type));
 639 }
 640
 641 static void
 642 workitem_alloc(item, type, mp)
 643         struct worklist *item;
 644         int type;
 645         struct mount *mp;
 646 {
 647         item->wk_type = type;
 648         item->wk_mp = mp;
 649         item->wk_state = 0;
 650         ACQUIRE_LOCK(&lk);
 651         VFSTOUFS(mp)->softdep_deps++;
 652         VFSTOUFS(mp)->softdep_accdeps++;
 653         FREE_LOCK(&lk);
 654 }
 655
 656 /*
 657  * Workitem queue management
 658  */
static int max_softdeps;        /* maximum number of structs before slowdown */
static int maxindirdeps = 50;   /* max number of indirdeps before slowdown */
static int tickdelay = 2;       /* number of ticks to pause during slowdown */
static int proc_waiting;        /* tracks whether we have a timeout posted */
static int *stat_countp;        /* statistic to count in proc_waiting timeout */
static struct callout_handle handle; /* handle on posted proc_waiting timeout */
static int req_pending;         /* set by softdep_speedup(), cleared by softdep_flush() */
static int req_clear_inodedeps; /* syncer process flush some inodedeps */
#define FLUSH_INODES            1
static int req_clear_remove;    /* syncer process flush some freeblks */
#define FLUSH_REMOVE            2
#define FLUSH_REMOVE_WAIT       3
/*
 * runtime statistics
 */
static int stat_worklist_push;  /* number of worklist cleanups */
static int stat_blk_limit_push; /* number of times block limit neared */
static int stat_ino_limit_push; /* number of times inode limit neared */
static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry;      /* bufs redirtied as dir entry cannot write */

/*
 * Export the tunables and statistics above under the debug sysctl tree.
 * All of them, including the statistics, are writable (CTLFLAG_RW).
 */
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */

SYSCTL_DECL(_vfs_ffs);

static int compute_summary_at_mount = 0;        /* Whether to recompute the summary at mount time */
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
           &compute_summary_at_mount, 0, "Recompute summary at mount");

/*
 * Descriptor for the "softdepflush" kernel process, which runs
 * softdep_flush(). It is handed to kproc_start through the SYSINIT below.
 * NOTE(review): softdepproc is presumably filled in with the new process
 * by kproc_start - confirm against kproc_start documentation.
 */
static struct proc *softdepproc;
static struct kproc_desc softdep_kp = {
        "softdepflush",
        softdep_flush,
        &softdepproc
};
SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp)
 713
 714 static void
 715 softdep_flush(void)
 716 {
 717         struct mount *nmp;
 718         struct mount *mp;
 719         struct ufsmount *ump;
 720         struct thread *td;
 721         int remaining;
 722         int vfslocked;
 723
 724         td = curthread;
 725         td->td_pflags |= TDP_NORUNNINGBUF;
 726
 727         for (;;) {
 728                 kproc_suspend_check(softdepproc);
 729                 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
 730                 ACQUIRE_LOCK(&lk);
 731                 /*
 732                  * If requested, try removing inode or removal dependencies.
 733                  */
 734                 if (req_clear_inodedeps) {
 735                         clear_inodedeps(td);
 736                         req_clear_inodedeps -= 1;
 737                         wakeup_one(&proc_waiting);
 738                 }
 739                 if (req_clear_remove) {
 740                         clear_remove(td);
 741                         req_clear_remove -= 1;
 742                         wakeup_one(&proc_waiting);
 743                 }
 744                 FREE_LOCK(&lk);
 745                 VFS_UNLOCK_GIANT(vfslocked);
 746                 remaining = 0;
 747                 mtx_lock(&mountlist_mtx);
 748                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
 749                         nmp = TAILQ_NEXT(mp, mnt_list);
 750                         if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
 751                                 continue;
 752                         if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 753                                 continue;
 754                         vfslocked = VFS_LOCK_GIANT(mp);
 755                         softdep_process_worklist(mp, 0);
 756                         ump = VFSTOUFS(mp);
 757                         remaining += ump->softdep_on_worklist -
 758                                 ump->softdep_on_worklist_inprogress;
 759                         VFS_UNLOCK_GIANT(vfslocked);
 760                         mtx_lock(&mountlist_mtx);
 761                         nmp = TAILQ_NEXT(mp, mnt_list);
 762                         vfs_unbusy(mp, td);
 763                 }
 764                 mtx_unlock(&mountlist_mtx);
 765                 if (remaining)
 766                         continue;
 767                 ACQUIRE_LOCK(&lk);
 768                 if (!req_pending)
 769                         msleep(&req_pending, &lk, PVM, "sdflush", hz);
 770                 req_pending = 0;
 771                 FREE_LOCK(&lk);
 772         }
 773 }
 774
 775 static int
 776 softdep_speedup(void)
 777 {
 778
 779         mtx_assert(&lk, MA_OWNED);
 780         if (req_pending == 0) {
 781                 req_pending = 1;
 782                 wakeup(&req_pending);
 783         }
 784
 785         return speedup_syncer();
 786 }
 787
 788 /*
 789  * Add an item to the end of the work queue.
 790  * This routine requires that the lock be held.
 791  * This is the only routine that adds items to the list.
 792  * The following routine is the only one that removes items
 793  * and does so in order from first to last.
 794  */
 795 static void
 796 add_to_worklist(wk)
 797         struct worklist *wk;
 798 {
 799         struct ufsmount *ump;
 800
 801         mtx_assert(&lk, MA_OWNED);
 802         ump = VFSTOUFS(wk->wk_mp);
 803         if (wk->wk_state & ONWORKLIST)
 804                 panic("add_to_worklist: already on list");
 805         wk->wk_state |= ONWORKLIST;
 806         if (LIST_EMPTY(&ump->softdep_workitem_pending))
 807                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 808         else
 809                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
 810         ump->softdep_worklist_tail = wk;
 811         ump->softdep_on_worklist += 1;
 812 }
 813
 814 /*
 815  * Process that runs once per second to handle items in the background queue.
 816  *
 817  * Note that we ensure that everything is done in the order in which they
 818  * appear in the queue. The code below depends on this property to ensure
 819  * that blocks of a file are freed before the inode itself is freed. This
 820  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 821  * until all the old ones have been purged from the dependency lists.
 822  */
 823 int
 824 softdep_process_worklist(mp, full)
 825         struct mount *mp;
 826         int full;
 827 {
 828         struct thread *td = curthread;
 829         int cnt, matchcnt, loopcount;
 830         struct ufsmount *ump;
 831         long starttime;
 832
 833         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
 834         /*
 835          * Record the process identifier of our caller so that we can give
 836          * this process preferential treatment in request_cleanup below.
 837          */
 838         matchcnt = 0;
 839         ump = VFSTOUFS(mp);
 840         ACQUIRE_LOCK(&lk);
 841         loopcount = 1;
 842         starttime = time_second;
 843         while (ump->softdep_on_worklist > 0) {
 844                 if ((cnt = process_worklist_item(mp, 0)) == -1)
 845                         break;
 846                 else
 847                         matchcnt += cnt;
 848                 /*
 849                  * If requested, try removing inode or removal dependencies.
 850                  */
 851                 if (req_clear_inodedeps) {
 852                         clear_inodedeps(td);
 853                         req_clear_inodedeps -= 1;
 854                         wakeup_one(&proc_waiting);
 855                 }
 856                 if (req_clear_remove) {
 857                         clear_remove(td);
 858                         req_clear_remove -= 1;
 859                         wakeup_one(&proc_waiting);
 860                 }
 861                 /*
 862                  * We do not generally want to stop for buffer space, but if
 863                  * we are really being a buffer hog, we will stop and wait.
 864                  */
 865                 if (loopcount++ % 128 == 0) {
 866                         FREE_LOCK(&lk);
 867                         bwillwrite();
 868                         ACQUIRE_LOCK(&lk);
 869                 }
 870                 /*
 871                  * Never allow processing to run for more than one
 872                  * second. Otherwise the other mountpoints may get
 873                  * excessively backlogged.
 874                  */
 875                 if (!full && starttime != time_second) {
 876                         matchcnt = -1;
 877                         break;
 878                 }
 879         }
 880         FREE_LOCK(&lk);
 881         return (matchcnt);
 882 }
 883
 884 /*
 885  * Process one item on the worklist.
 886  */
 887 static int
 888 process_worklist_item(mp, flags)
 889         struct mount *mp;
 890         int flags;
 891 {
 892         struct worklist *wk, *wkend;
 893         struct ufsmount *ump;
 894         struct vnode *vp;
 895         int matchcnt = 0;
 896
 897         mtx_assert(&lk, MA_OWNED);
 898         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
 899         /*
 900          * If we are being called because of a process doing a
 901          * copy-on-write, then it is not safe to write as we may
 902          * recurse into the copy-on-write routine.
 903          */
 904         if (curthread->td_pflags & TDP_COWINPROGRESS)
 905                 return (-1);
 906         /*
 907          * Normally we just process each item on the worklist in order.
 908          * However, if we are in a situation where we cannot lock any
 909          * inodes, we have to skip over any dirrem requests whose
 910          * vnodes are resident and locked.
 911          */
 912         ump = VFSTOUFS(mp);
 913         vp = NULL;
 914         LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
 915                 if (wk->wk_state & INPROGRESS)
 916                         continue;
 917                 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
 918                         break;
 919                 wk->wk_state |= INPROGRESS;
 920                 ump->softdep_on_worklist_inprogress++;
 921                 FREE_LOCK(&lk);
 922                 ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum,
 923                     LK_NOWAIT | LK_EXCLUSIVE, &vp);
 924                 ACQUIRE_LOCK(&lk);
 925                 wk->wk_state &= ~INPROGRESS;
 926                 ump->softdep_on_worklist_inprogress--;
 927                 if (vp != NULL)
 928                         break;
 929         }
 930         if (wk == 0)
 931                 return (-1);
 932         /*
 933          * Remove the item to be processed. If we are removing the last
 934          * item on the list, we need to recalculate the tail pointer.
 935          * As this happens rarely and usually when the list is short,
 936          * we just run down the list to find it rather than tracking it
 937          * in the above loop.
 938          */
 939         WORKLIST_REMOVE(wk);
 940         if (wk == ump->softdep_worklist_tail) {
 941                 LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
 942                         if (LIST_NEXT(wkend, wk_list) == NULL)
 943                                 break;
 944                 ump->softdep_worklist_tail = wkend;
 945         }
 946         ump->softdep_on_worklist -= 1;
 947         FREE_LOCK(&lk);
 948         if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 949                 panic("process_worklist_item: suspended filesystem");
 950         matchcnt++;
 951         switch (wk->wk_type) {
 952
 953         case D_DIRREM:
 954                 /* removal of a directory entry */
 955                 handle_workitem_remove(WK_DIRREM(wk), vp);
 956                 break;
 957
 958         case D_FREEBLKS:
 959                 /* releasing blocks and/or fragments from a file */
 960                 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
 961                 break;
 962
 963         case D_FREEFRAG:
 964                 /* releasing a fragment when replaced as a file grows */
 965                 handle_workitem_freefrag(WK_FREEFRAG(wk));
 966                 break;
 967
 968         case D_FREEFILE:
 969                 /* releasing an inode when its link count drops to 0 */
 970                 handle_workitem_freefile(WK_FREEFILE(wk));
 971                 break;
 972
 973         default:
 974                 panic("%s_process_worklist: Unknown type %s",
 975                     "softdep", TYPENAME(wk->wk_type));
 976                 /* NOTREACHED */
 977         }
 978         vn_finished_secondary_write(mp);
 979         ACQUIRE_LOCK(&lk);
 980         return (matchcnt);
 981 }
 982
 983 /*
 984  * Move dependencies from one buffer to another.
 985  */
 986 void
 987 softdep_move_dependencies(oldbp, newbp)
 988         struct buf *oldbp;
 989         struct buf *newbp;
 990 {
 991         struct worklist *wk, *wktail;
 992
 993         if (!LIST_EMPTY(&newbp->b_dep))
 994                 panic("softdep_move_dependencies: need merge code");
 995         wktail = 0;
 996         ACQUIRE_LOCK(&lk);
 997         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 998                 LIST_REMOVE(wk, wk_list);
 999                 if (wktail == 0)
1000                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1001                 else
1002                         LIST_INSERT_AFTER(wktail, wk, wk_list);
1003                 wktail = wk;
1004         }
1005         FREE_LOCK(&lk);
1006 }
1007
1008 /*
1009  * Purge the work list of all items associated with a particular mount point.
1010  */
1011 int
1012 softdep_flushworklist(oldmnt, countp, td)
1013         struct mount *oldmnt;
1014         int *countp;
1015         struct thread *td;
1016 {
1017         struct vnode *devvp;
1018         int count, error = 0;
1019         struct ufsmount *ump;
1020
1021         /*
1022          * Alternately flush the block device associated with the mount
1023          * point and process any dependencies that the flushing
1024          * creates. We continue until no more worklist dependencies
1025          * are found.
1026          */
1027         *countp = 0;
1028         ump = VFSTOUFS(oldmnt);
1029         devvp = ump->um_devvp;
1030         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1031                 *countp += count;
1032                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
1033                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1034                 VOP_UNLOCK(devvp, 0, td);
1035                 if (error)
1036                         break;
1037         }
1038         return (error);
1039 }
1040
1041 int
1042 softdep_waitidle(struct mount *mp)
1043 {
1044         struct ufsmount *ump;
1045         int error;
1046         int i;
1047
1048         ump = VFSTOUFS(mp);
1049         ACQUIRE_LOCK(&lk);
1050         for (i = 0; i < 10 && ump->softdep_deps; i++) {
1051                 ump->softdep_req = 1;
1052                 if (ump->softdep_on_worklist)
1053                         panic("softdep_waitidle: work added after flush.");
1054                 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1055         }
1056         ump->softdep_req = 0;
1057         FREE_LOCK(&lk);
1058         error = 0;
1059         if (i == 10) {
1060                 error = EBUSY;
1061                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
1062                     mp);
1063         }
1064
1065         return (error);
1066 }
1067
1068 /*
1069  * Flush all vnodes and worklist items associated with a specified mount point.
1070  */
1071 int
1072 softdep_flushfiles(oldmnt, flags, td)
1073         struct mount *oldmnt;
1074         int flags;
1075         struct thread *td;
1076 {
1077         int error, count, loopcnt;
1078
1079         error = 0;
1080
1081         /*
1082          * Alternately flush the vnodes associated with the mount
1083          * point and process any dependencies that the flushing
1084          * creates. In theory, this loop can happen at most twice,
1085          * but we give it a few extra just to be sure.
1086          */
1087         for (loopcnt = 10; loopcnt > 0; loopcnt--) {
1088                 /*
1089                  * Do another flush in case any vnodes were brought in
1090                  * as part of the cleanup operations.
1091                  */
1092                 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1093                         break;
1094                 if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
1095                     count == 0)
1096                         break;
1097         }
1098         /*
1099          * If we are unmounting then it is an error to fail. If we
1100          * are simply trying to downgrade to read-only, then filesystem
1101          * activity can keep us busy forever, so we just fail with EBUSY.
1102          */
1103         if (loopcnt == 0) {
1104                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1105                         panic("softdep_flushfiles: looping");
1106                 error = EBUSY;
1107         }
1108         if (!error)
1109                 error = softdep_waitidle(oldmnt);
1110         return (error);
1111 }
1112
1113 /*
1114  * Structure hashing.
1115  *
1116  * There are three types of structures that can be looked up:
1117  *      1) pagedep structures identified by mount point, inode number,
1118  *         and logical block.
1119  *      2) inodedep structures identified by mount point and inode number.
1120  *      3) newblk structures identified by mount point and
1121  *         physical block number.
1122  *
1123  * The "pagedep" and "inodedep" dependency structures are hashed
1124  * separately from the file blocks and inodes to which they correspond.
1125  * This separation helps when the in-memory copy of an inode or
1126  * file block must be replaced. It also obviates the need to access
1127  * an inode or file page when simply updating (or de-allocating)
1128  * dependency structures. Lookup of newblk structures is needed to
1129  * find newly allocated blocks when trying to associate them with
1130  * their allocdirect or allocindir structure.
1131  *
1132  * The lookup routines optionally create and hash a new instance when
1133  * an existing entry is not found.
1134  */
1135 #define DEPALLOC        0x0001  /* allocate structure if lookup fails */
1136 #define NODELAY         0x0002  /* cannot do background work */
1137
1138 /*
1139  * Structures and routines associated with pagedep caching.
1140  */
1141 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1142 u_long  pagedep_hash;           /* size of hash table - 1 */
1143 #define PAGEDEP_HASH(mp, inum, lbn) \
1144         (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1145             pagedep_hash])
1146
1147 static int
1148 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1149         struct pagedep_hashhead *pagedephd;
1150         ino_t ino;
1151         ufs_lbn_t lbn;
1152         struct mount *mp;
1153         int flags;
1154         struct pagedep **pagedeppp;
1155 {
1156         struct pagedep *pagedep;
1157
1158         LIST_FOREACH(pagedep, pagedephd, pd_hash)
1159                 if (ino == pagedep->pd_ino &&
1160                     lbn == pagedep->pd_lbn &&
1161                     mp == pagedep->pd_list.wk_mp)
1162                         break;
1163         if (pagedep) {
1164                 *pagedeppp = pagedep;
1165                 if ((flags & DEPALLOC) != 0 &&
1166                     (pagedep->pd_state & ONWORKLIST) == 0)
1167                         return (0);
1168                 return (1);
1169         }
1170         *pagedeppp = NULL;
1171         return (0);
1172 }
1173 /*
1174  * Look up a pagedep. Return 1 if found, 0 if not found or found
1175  * when asked to allocate but not associated with any buffer.
1176  * If not found, allocate if DEPALLOC flag is passed.
1177  * Found or allocated entry is returned in pagedeppp.
1178  * This routine must be called with splbio interrupts blocked.
1179  */
1180 static int
1181 pagedep_lookup(ip, lbn, flags, pagedeppp)
1182         struct inode *ip;
1183         ufs_lbn_t lbn;
1184         int flags;
1185         struct pagedep **pagedeppp;
1186 {
1187         struct pagedep *pagedep;
1188         struct pagedep_hashhead *pagedephd;
1189         struct mount *mp;
1190         int ret;
1191         int i;
1192
1193         mtx_assert(&lk, MA_OWNED);
1194         mp = ITOV(ip)->v_mount;
1195         pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
1196
1197         ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
1198         if (*pagedeppp || (flags & DEPALLOC) == 0)
1199                 return (ret);
1200         FREE_LOCK(&lk);
1201         MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
1202             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1203         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1204         ACQUIRE_LOCK(&lk);
1205         ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
1206         if (*pagedeppp) {
1207                 WORKITEM_FREE(pagedep, D_PAGEDEP);
1208                 return (ret);
1209         }
1210         pagedep->pd_ino = ip->i_number;
1211         pagedep->pd_lbn = lbn;
1212         LIST_INIT(&pagedep->pd_dirremhd);
1213         LIST_INIT(&pagedep->pd_pendinghd);
1214         for (i = 0; i < DAHASHSZ; i++)
1215                 LIST_INIT(&pagedep->pd_diraddhd[i]);
1216         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1217         *pagedeppp = pagedep;
1218         return (0);
1219 }
1220
1221 /*
1222  * Structures and routines associated with inodedep caching.
1223  */
1224 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1225 static u_long   inodedep_hash;  /* size of hash table - 1 */
1226 static long     num_inodedep;   /* number of inodedep allocated */
1227 #define INODEDEP_HASH(fs, inum) \
1228       (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1229
1230 static int
1231 inodedep_find(inodedephd, fs, inum, inodedeppp)
1232         struct inodedep_hashhead *inodedephd;
1233         struct fs *fs;
1234         ino_t inum;
1235         struct inodedep **inodedeppp;
1236 {
1237         struct inodedep *inodedep;
1238
1239         LIST_FOREACH(inodedep, inodedephd, id_hash)
1240                 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1241                         break;
1242         if (inodedep) {
1243                 *inodedeppp = inodedep;
1244                 return (1);
1245         }
1246         *inodedeppp = NULL;
1247
1248         return (0);
1249 }
1250 /*
1251  * Look up an inodedep. Return 1 if found, 0 if not found.
1252  * If not found, allocate if DEPALLOC flag is passed.
1253  * Found or allocated entry is returned in inodedeppp.
1254  * This routine must be called with splbio interrupts blocked.
1255  */
1256 static int
1257 inodedep_lookup(mp, inum, flags, inodedeppp)
1258         struct mount *mp;
1259         ino_t inum;
1260         int flags;
1261         struct inodedep **inodedeppp;
1262 {
1263         struct inodedep *inodedep;
1264         struct inodedep_hashhead *inodedephd;
1265         struct fs *fs;
1266
1267         mtx_assert(&lk, MA_OWNED);
1268         fs = VFSTOUFS(mp)->um_fs;
1269         inodedephd = INODEDEP_HASH(fs, inum);
1270
1271         if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1272                 return (1);
1273         if ((flags & DEPALLOC) == 0)
1274                 return (0);
1275         /*
1276          * If we are over our limit, try to improve the situation.
1277          */
1278         if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
1279                 request_cleanup(mp, FLUSH_INODES);
1280         FREE_LOCK(&lk);
1281         MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
1282                 M_INODEDEP, M_SOFTDEP_FLAGS);
1283         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1284         ACQUIRE_LOCK(&lk);
1285         if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1286                 WORKITEM_FREE(inodedep, D_INODEDEP);
1287                 return (1);
1288         }
1289         num_inodedep += 1;
1290         inodedep->id_fs = fs;
1291         inodedep->id_ino = inum;
1292         inodedep->id_state = ALLCOMPLETE;
1293         inodedep->id_nlinkdelta = 0;
1294         inodedep->id_savedino1 = NULL;
1295         inodedep->id_savedsize = -1;
1296         inodedep->id_savedextsize = -1;
1297         inodedep->id_buf = NULL;
1298         LIST_INIT(&inodedep->id_pendinghd);
1299         LIST_INIT(&inodedep->id_inowait);
1300         LIST_INIT(&inodedep->id_bufwait);
1301         TAILQ_INIT(&inodedep->id_inoupdt);
1302         TAILQ_INIT(&inodedep->id_newinoupdt);
1303         TAILQ_INIT(&inodedep->id_extupdt);
1304         TAILQ_INIT(&inodedep->id_newextupdt);
1305         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1306         *inodedeppp = inodedep;
1307         return (0);
1308 }
1309
1310 /*
1311  * Structures and routines associated with newblk caching.
1312  */
1313 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1314 u_long  newblk_hash;            /* size of hash table - 1 */
1315 #define NEWBLK_HASH(fs, inum) \
1316         (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1317
1318 static int
1319 newblk_find(newblkhd, fs, newblkno, newblkpp)
1320         struct newblk_hashhead *newblkhd;
1321         struct fs *fs;
1322         ufs2_daddr_t newblkno;
1323         struct newblk **newblkpp;
1324 {
1325         struct newblk *newblk;
1326
1327         LIST_FOREACH(newblk, newblkhd, nb_hash)
1328                 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1329                         break;
1330         if (newblk) {
1331                 *newblkpp = newblk;
1332                 return (1);
1333         }
1334         *newblkpp = NULL;
1335         return (0);
1336 }
1337
1338 /*
1339  * Look up a newblk. Return 1 if found, 0 if not found.
1340  * If not found, allocate if DEPALLOC flag is passed.
1341  * Found or allocated entry is returned in newblkpp.
1342  */
1343 static int
1344 newblk_lookup(fs, newblkno, flags, newblkpp)
1345         struct fs *fs;
1346         ufs2_daddr_t newblkno;
1347         int flags;
1348         struct newblk **newblkpp;
1349 {
1350         struct newblk *newblk;
1351         struct newblk_hashhead *newblkhd;
1352
1353         newblkhd = NEWBLK_HASH(fs, newblkno);
1354         if (newblk_find(newblkhd, fs, newblkno, newblkpp))
1355                 return (1);
1356         if ((flags & DEPALLOC) == 0)
1357                 return (0);
1358         FREE_LOCK(&lk);
1359         MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1360                 M_NEWBLK, M_SOFTDEP_FLAGS);
1361         ACQUIRE_LOCK(&lk);
1362         if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
1363                 FREE(newblk, M_NEWBLK);
1364                 return (1);
1365         }
1366         newblk->nb_state = 0;
1367         newblk->nb_fs = fs;
1368         newblk->nb_newblkno = newblkno;
1369         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1370         *newblkpp = newblk;
1371         return (0);
1372 }
1373
1374 /*
1375  * Executed during filesystem system initialization before
1376  * mounting any filesystems.
1377  */
1378 void
1379 softdep_initialize()
1380 {
1381
1382         LIST_INIT(&mkdirlisthd);
1383         max_softdeps = desiredvnodes * 4;
1384         pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1385             &pagedep_hash);
1386         inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1387         newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1388
1389         /* initialise bioops hack */
1390         bioops.io_start = softdep_disk_io_initiation;
1391         bioops.io_complete = softdep_disk_write_complete;
1392         bioops.io_deallocate = softdep_deallocate_dependencies;
1393         bioops.io_countdeps = softdep_count_dependencies;
1394 }
1395
1396 /*
1397  * Executed after all filesystems have been unmounted during
1398  * filesystem module unload.
1399  */
1400 void
1401 softdep_uninitialize()
1402 {
1403
1404         hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
1405         hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
1406         hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
1407 }
1408
1409 /*
1410  * Called at mount time to notify the dependency code that a
1411  * filesystem wishes to use it.
1412  */
1413 int
1414 softdep_mount(devvp, mp, fs, cred)
1415         struct vnode *devvp;
1416         struct mount *mp;
1417         struct fs *fs;
1418         struct ucred *cred;
1419 {
1420         struct csum_total cstotal;
1421         struct ufsmount *ump;
1422         struct cg *cgp;
1423         struct buf *bp;
1424         int error, cyl;
1425
1426         MNT_ILOCK(mp);
1427         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
1428         if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
1429                 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
1430                         MNTK_SOFTDEP;
1431                 mp->mnt_noasync++;
1432         }
1433         MNT_IUNLOCK(mp);
1434         ump = VFSTOUFS(mp);
1435         LIST_INIT(&ump->softdep_workitem_pending);
1436         ump->softdep_worklist_tail = NULL;
1437         ump->softdep_on_worklist = 0;
1438         ump->softdep_deps = 0;
1439         /*
1440          * When doing soft updates, the counters in the
1441          * superblock may have gotten out of sync. Recomputation
1442          * can take a long time and can be deferred for background
1443          * fsck.  However, the old behavior of scanning the cylinder
1444          * groups and recalculating them at mount time is available
1445          * by setting vfs.ffs.compute_summary_at_mount to one.
1446          */
1447         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
1448                 return (0);
1449         bzero(&cstotal, sizeof cstotal);
1450         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1451                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1452                     fs->fs_cgsize, cred, &bp)) != 0) {
1453                         brelse(bp);
1454                         return (error);
1455                 }
1456                 cgp = (struct cg *)bp->b_data;
1457                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1458                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1459                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1460                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1461                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1462                 brelse(bp);
1463         }
1464 #ifdef DEBUG
1465         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1466                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1467 #endif
1468         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1469         return (0);
1470 }
1471
1472 /*
1473  * Protecting the freemaps (or bitmaps).
1474  *
1475  * To eliminate the need to execute fsck before mounting a filesystem
1476  * after a power failure, one must (conservatively) guarantee that the
1477  * on-disk copy of the bitmaps never indicate that a live inode or block is
1478  * free.  So, when a block or inode is allocated, the bitmap should be
1479  * updated (on disk) before any new pointers.  When a block or inode is
1480  * freed, the bitmap should not be updated until all pointers have been
1481  * reset.  The latter dependency is handled by the delayed de-allocation
1482  * approach described below for block and inode de-allocation.  The former
1483  * dependency is handled by calling the following procedure when a block or
1484  * inode is allocated. When an inode is allocated an "inodedep" is created
1485  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1486  * Each "inodedep" is also inserted into the hash indexing structure so
1487  * that any additional link additions can be made dependent on the inode
1488  * allocation.
1489  *
1490  * The ufs filesystem maintains a number of free block counts (e.g., per
1491  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1492  * in addition to the bitmaps.  These counts are used to improve efficiency
1493  * during allocation and therefore must be consistent with the bitmaps.
1494  * There is no convenient way to guarantee post-crash consistency of these
1495  * counts with simple update ordering, for two main reasons: (1) The counts
1496  * and bitmaps for a single cylinder group block are not in the same disk
1497  * sector.  If a disk write is interrupted (e.g., by power failure), one may
1498  * be written and the other not.  (2) Some of the counts are located in the
1499  * superblock rather than the cylinder group block. So, we focus our soft
1500  * updates implementation on protecting the bitmaps. When mounting a
1501  * filesystem, we recompute the auxiliary counts from the bitmaps.
1502  */
1503
1504 /*
1505  * Called just after updating the cylinder group block to allocate an inode.
1506  */
1507 void
1508 softdep_setup_inomapdep(bp, ip, newinum)
1509         struct buf *bp;         /* buffer for cylgroup block with inode map */
1510         struct inode *ip;       /* inode related to allocation */
1511         ino_t newinum;          /* new inode number being allocated */
1512 {
1513         struct inodedep *inodedep;
1514         struct bmsafemap *bmsafemap;
1515
1516         /*
1517          * Create a dependency for the newly allocated inode.
1518          * Panic if it already exists as something is seriously wrong.
1519          * Otherwise add it to the dependency list for the buffer holding
1520          * the cylinder group map from which it was allocated.
1521          */
1522         ACQUIRE_LOCK(&lk);
1523         if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
1524             &inodedep)))
1525                 panic("softdep_setup_inomapdep: dependency for new inode "
1526                     "already exists");
1527         inodedep->id_buf = bp;
1528         inodedep->id_state &= ~DEPCOMPLETE;
1529         bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
1530         LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1531         FREE_LOCK(&lk);
1532 }
1533
1534 /*
1535  * Called just after updating the cylinder group block to
1536  * allocate block or fragment.
1537  */
1538 void
1539 softdep_setup_blkmapdep(bp, mp, newblkno)
1540         struct buf *bp;         /* buffer for cylgroup block with block map */
1541         struct mount *mp;       /* filesystem doing allocation */
1542         ufs2_daddr_t newblkno;  /* number of newly allocated block */
1543 {
1544         struct newblk *newblk;
1545         struct bmsafemap *bmsafemap;
1546         struct fs *fs;
1547
1548         fs = VFSTOUFS(mp)->um_fs;
1549         /*
1550          * Create a dependency for the newly allocated block.
1551          * Add it to the dependency list for the buffer holding
1552          * the cylinder group map from which it was allocated.
1553          */
1554         ACQUIRE_LOCK(&lk);
1555         if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1556                 panic("softdep_setup_blkmapdep: found block");
1557         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
1558         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1559         FREE_LOCK(&lk);
1560 }
1561
1562 /*
1563  * Find the bmsafemap associated with a cylinder group buffer.
1564  * If none exists, create one. The buffer must be locked when
1565  * this routine is called and this routine must be called with
1566  * splbio interrupts blocked.
1567  */
1568 static struct bmsafemap *
1569 bmsafemap_lookup(mp, bp)
1570         struct mount *mp;
1571         struct buf *bp;
1572 {
1573         struct bmsafemap *bmsafemap;
1574         struct worklist *wk;
1575
1576         mtx_assert(&lk, MA_OWNED);
1577         LIST_FOREACH(wk, &bp->b_dep, wk_list)
1578                 if (wk->wk_type == D_BMSAFEMAP)
1579                         return (WK_BMSAFEMAP(wk));
1580         FREE_LOCK(&lk);
1581         MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1582                 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1583         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
1584         bmsafemap->sm_buf = bp;
1585         LIST_INIT(&bmsafemap->sm_allocdirecthd);
1586         LIST_INIT(&bmsafemap->sm_allocindirhd);
1587         LIST_INIT(&bmsafemap->sm_inodedephd);
1588         LIST_INIT(&bmsafemap->sm_newblkhd);
1589         ACQUIRE_LOCK(&lk);
1590         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1591         return (bmsafemap);
1592 }
1593
1594 /*
1595  * Direct block allocation dependencies.
1596  *
1597  * When a new block is allocated, the corresponding disk locations must be
1598  * initialized (with zeros or new data) before the on-disk inode points to
1599  * them.  Also, the freemap from which the block was allocated must be
1600  * updated (on disk) before the inode's pointer. These two dependencies are
1601  * independent of each other and are needed for all file blocks and indirect
1602  * blocks that are pointed to directly by the inode.  Just before the
1603  * "in-core" version of the inode is updated with a newly allocated block
1604  * number, a procedure (below) is called to setup allocation dependency
1605  * structures.  These structures are removed when the corresponding
1606  * dependencies are satisfied or when the block allocation becomes obsolete
1607  * (i.e., the file is deleted, the block is de-allocated, or the block is a
1608  * fragment that gets upgraded).  All of these cases are handled in
1609  * procedures described later.
1610  *
1611  * When a file extension causes a fragment to be upgraded, either to a larger
1612  * fragment or to a full block, the on-disk location may change (if the
1613  * previous fragment could not simply be extended). In this case, the old
1614  * fragment must be de-allocated, but not until after the inode's pointer has
1615  * been updated. In most cases, this is handled by later procedures, which
1616  * will construct a "freefrag" structure to be added to the workitem queue
1617  * when the inode update is complete (or obsolete).  The main exception to
1618  * this is when an allocation occurs while a pending allocation dependency
1619  * (for the same block pointer) remains.  This case is handled in the main
1620  * allocation dependency setup procedure by immediately freeing the
1621  * unreferenced fragments.
1622  */
1623 void
1624 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1625         struct inode *ip;       /* inode to which block is being added */
1626         ufs_lbn_t lbn;          /* block pointer within inode */
1627         ufs2_daddr_t newblkno;  /* disk block number being added */
1628         ufs2_daddr_t oldblkno;  /* previous block number, 0 unless frag */
1629         long newsize;           /* size of new block */
1630         long oldsize;           /* size of new block */
1631         struct buf *bp;         /* bp for allocated block */
1632 {
1633         struct allocdirect *adp, *oldadp;
1634         struct allocdirectlst *adphead;
1635         struct bmsafemap *bmsafemap;
1636         struct inodedep *inodedep;
1637         struct pagedep *pagedep;
1638         struct newblk *newblk;
1639         struct mount *mp;
1640
1641         mp = UFSTOVFS(ip->i_ump);
1642         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1643                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1644         workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
1645         adp->ad_lbn = lbn;
1646         adp->ad_newblkno = newblkno;
1647         adp->ad_oldblkno = oldblkno;
1648         adp->ad_newsize = newsize;
1649         adp->ad_oldsize = oldsize;
1650         adp->ad_state = ATTACHED;
1651         LIST_INIT(&adp->ad_newdirblk);
1652         if (newblkno == oldblkno)
1653                 adp->ad_freefrag = NULL;
1654         else
1655                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1656
1657         ACQUIRE_LOCK(&lk);
1658         if (lbn >= NDADDR) {
1659                 /* allocating an indirect block */
1660                 if (oldblkno != 0)
1661                         panic("softdep_setup_allocdirect: non-zero indir");
1662         } else {
1663                 /*
1664                  * Allocating a direct block.
1665                  *
1666                  * If we are allocating a directory block, then we must
1667                  * allocate an associated pagedep to track additions and
1668                  * deletions.
1669                  */
1670                 if ((ip->i_mode & IFMT) == IFDIR &&
1671                     pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1672                         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1673         }
1674         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1675                 panic("softdep_setup_allocdirect: lost block");
1676         if (newblk->nb_state == DEPCOMPLETE) {
1677                 adp->ad_state |= DEPCOMPLETE;
1678                 adp->ad_buf = NULL;
1679         } else {
1680                 bmsafemap = newblk->nb_bmsafemap;
1681                 adp->ad_buf = bmsafemap->sm_buf;
1682                 LIST_REMOVE(newblk, nb_deps);
1683                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1684         }
1685         LIST_REMOVE(newblk, nb_hash);
1686         FREE(newblk, M_NEWBLK);
1687
1688         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1689         adp->ad_inodedep = inodedep;
1690         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1691         /*
1692          * The list of allocdirects must be kept in sorted and ascending
1693          * order so that the rollback routines can quickly determine the
1694          * first uncommitted block (the size of the file stored on disk
1695          * ends at the end of the lowest committed fragment, or if there
1696          * are no fragments, at the end of the highest committed block).
1697          * Since files generally grow, the typical case is that the new
1698          * block is to be added at the end of the list. We speed this
1699          * special case by checking against the last allocdirect in the
1700          * list before laboriously traversing the list looking for the
1701          * insertion point.
1702          */
1703         adphead = &inodedep->id_newinoupdt;
1704         oldadp = TAILQ_LAST(adphead, allocdirectlst);
1705         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1706                 /* insert at end of list */
1707                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1708                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1709                         allocdirect_merge(adphead, adp, oldadp);
1710                 FREE_LOCK(&lk);
1711                 return;
1712         }
1713         TAILQ_FOREACH(oldadp, adphead, ad_next) {
1714                 if (oldadp->ad_lbn >= lbn)
1715                         break;
1716         }
1717         if (oldadp == NULL)
1718                 panic("softdep_setup_allocdirect: lost entry");
1719         /* insert in middle of list */
1720         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1721         if (oldadp->ad_lbn == lbn)
1722                 allocdirect_merge(adphead, adp, oldadp);
1723         FREE_LOCK(&lk);
1724 }
1725
1726 /*
1727  * Replace an old allocdirect dependency with a newer one.
1728  * This routine must be called with splbio interrupts blocked.
1729  */
1730 static void
1731 allocdirect_merge(adphead, newadp, oldadp)
1732         struct allocdirectlst *adphead; /* head of list holding allocdirects */
1733         struct allocdirect *newadp;     /* allocdirect being added */
1734         struct allocdirect *oldadp;     /* existing allocdirect being checked */
1735 {
1736         struct worklist *wk;
1737         struct freefrag *freefrag;
1738         struct newdirblk *newdirblk;
1739
1740         mtx_assert(&lk, MA_OWNED);
1741         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1742             newadp->ad_oldsize != oldadp->ad_newsize ||
1743             newadp->ad_lbn >= NDADDR)
1744                 panic("%s %jd != new %jd || old size %ld != new %ld",
1745                     "allocdirect_merge: old blkno",
1746                     (intmax_t)newadp->ad_oldblkno,
1747                     (intmax_t)oldadp->ad_newblkno,
1748                     newadp->ad_oldsize, oldadp->ad_newsize);
1749         newadp->ad_oldblkno = oldadp->ad_oldblkno;
1750         newadp->ad_oldsize = oldadp->ad_oldsize;
1751         /*
1752          * If the old dependency had a fragment to free or had never
1753          * previously had a block allocated, then the new dependency
1754          * can immediately post its freefrag and adopt the old freefrag.
1755          * This action is done by swapping the freefrag dependencies.
1756          * The new dependency gains the old one's freefrag, and the
1757          * old one gets the new one and then immediately puts it on
1758          * the worklist when it is freed by free_allocdirect. It is
1759          * not possible to do this swap when the old dependency had a
1760          * non-zero size but no previous fragment to free. This condition
1761          * arises when the new block is an extension of the old block.
1762          * Here, the first part of the fragment allocated to the new
1763          * dependency is part of the block currently claimed on disk by
1764          * the old dependency, so cannot legitimately be freed until the
1765          * conditions for the new dependency are fulfilled.
1766          */
1767         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1768                 freefrag = newadp->ad_freefrag;
1769                 newadp->ad_freefrag = oldadp->ad_freefrag;
1770                 oldadp->ad_freefrag = freefrag;
1771         }
1772         /*
1773          * If we are tracking a new directory-block allocation,
1774          * move it from the old allocdirect to the new allocdirect.
1775          */
1776         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1777                 newdirblk = WK_NEWDIRBLK(wk);
1778                 WORKLIST_REMOVE(&newdirblk->db_list);
1779                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
1780                         panic("allocdirect_merge: extra newdirblk");
1781                 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1782         }
1783         free_allocdirect(adphead, oldadp, 0);
1784 }
1785
1786 /*
1787  * Allocate a new freefrag structure if needed.
1788  */
1789 static struct freefrag *
1790 newfreefrag(ip, blkno, size)
1791         struct inode *ip;
1792         ufs2_daddr_t blkno;
1793         long size;
1794 {
1795         struct freefrag *freefrag;
1796         struct fs *fs;
1797
1798         if (blkno == 0)
1799                 return (NULL);
1800         fs = ip->i_fs;
1801         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1802                 panic("newfreefrag: frag size");
1803         MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1804                 M_FREEFRAG, M_SOFTDEP_FLAGS);
1805         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
1806         freefrag->ff_inum = ip->i_number;
1807         freefrag->ff_blkno = blkno;
1808         freefrag->ff_fragsize = size;
1809         return (freefrag);
1810 }
1811
1812 /*
1813  * This workitem de-allocates fragments that were replaced during
1814  * file block allocation.
1815  */
1816 static void
1817 handle_workitem_freefrag(freefrag)
1818         struct freefrag *freefrag;
1819 {
1820         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
1821
1822         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
1823             freefrag->ff_fragsize, freefrag->ff_inum);
1824         ACQUIRE_LOCK(&lk);
1825         WORKITEM_FREE(freefrag, D_FREEFRAG);
1826         FREE_LOCK(&lk);
1827 }
1828
1829 /*
1830  * Set up a dependency structure for an external attributes data block.
1831  * This routine follows much of the structure of softdep_setup_allocdirect.
1832  * See the description of softdep_setup_allocdirect above for details.
1833  */
1834 void
1835 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1836         struct inode *ip;
1837         ufs_lbn_t lbn;
1838         ufs2_daddr_t newblkno;
1839         ufs2_daddr_t oldblkno;
1840         long newsize;
1841         long oldsize;
1842         struct buf *bp;
1843 {
1844         struct allocdirect *adp, *oldadp;
1845         struct allocdirectlst *adphead;
1846         struct bmsafemap *bmsafemap;
1847         struct inodedep *inodedep;
1848         struct newblk *newblk;
1849         struct mount *mp;
1850
1851         mp = UFSTOVFS(ip->i_ump);
1852         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1853                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1854         workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
1855         adp->ad_lbn = lbn;
1856         adp->ad_newblkno = newblkno;
1857         adp->ad_oldblkno = oldblkno;
1858         adp->ad_newsize = newsize;
1859         adp->ad_oldsize = oldsize;
1860         adp->ad_state = ATTACHED | EXTDATA;
1861         LIST_INIT(&adp->ad_newdirblk);
1862         if (newblkno == oldblkno)
1863                 adp->ad_freefrag = NULL;
1864         else
1865                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1866
1867         ACQUIRE_LOCK(&lk);
1868         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1869                 panic("softdep_setup_allocext: lost block");
1870
1871         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1872         adp->ad_inodedep = inodedep;
1873
1874         if (newblk->nb_state == DEPCOMPLETE) {
1875                 adp->ad_state |= DEPCOMPLETE;
1876                 adp->ad_buf = NULL;
1877         } else {
1878                 bmsafemap = newblk->nb_bmsafemap;
1879                 adp->ad_buf = bmsafemap->sm_buf;
1880                 LIST_REMOVE(newblk, nb_deps);
1881                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1882         }
1883         LIST_REMOVE(newblk, nb_hash);
1884         FREE(newblk, M_NEWBLK);
1885
1886         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1887         if (lbn >= NXADDR)
1888                 panic("softdep_setup_allocext: lbn %lld > NXADDR",
1889                     (long long)lbn);
1890         /*
1891          * The list of allocdirects must be kept in sorted and ascending
1892          * order so that the rollback routines can quickly determine the
1893          * first uncommitted block (the size of the file stored on disk
1894          * ends at the end of the lowest committed fragment, or if there
1895          * are no fragments, at the end of the highest committed block).
1896          * Since files generally grow, the typical case is that the new
1897          * block is to be added at the end of the list. We speed this
1898          * special case by checking against the last allocdirect in the
1899          * list before laboriously traversing the list looking for the
1900          * insertion point.
1901          */
1902         adphead = &inodedep->id_newextupdt;
1903         oldadp = TAILQ_LAST(adphead, allocdirectlst);
1904         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1905                 /* insert at end of list */
1906                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1907                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1908                         allocdirect_merge(adphead, adp, oldadp);
1909                 FREE_LOCK(&lk);
1910                 return;
1911         }
1912         TAILQ_FOREACH(oldadp, adphead, ad_next) {
1913                 if (oldadp->ad_lbn >= lbn)
1914                         break;
1915         }
1916         if (oldadp == NULL)
1917                 panic("softdep_setup_allocext: lost entry");
1918         /* insert in middle of list */
1919         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1920         if (oldadp->ad_lbn == lbn)
1921                 allocdirect_merge(adphead, adp, oldadp);
1922         FREE_LOCK(&lk);
1923 }
1924
1925 /*
1926  * Indirect block allocation dependencies.
1927  *
1928  * The same dependencies that exist for a direct block also exist when
1929  * a new block is allocated and pointed to by an entry in a block of
1930  * indirect pointers. The undo/redo states described above are also
1931  * used here. Because an indirect block contains many pointers that
1932  * may have dependencies, a second copy of the entire in-memory indirect
1933  * block is kept. The buffer cache copy is always completely up-to-date.
1934  * The second copy, which is used only as a source for disk writes,
1935  * contains only the safe pointers (i.e., those that have no remaining
1936  * update dependencies). The second copy is freed when all pointers
1937  * are safe. The cache is not allowed to replace indirect blocks with
1938  * pending update dependencies. If a buffer containing an indirect
1939  * block with dependencies is written, these routines will mark it
1940  * dirty again. It can only be successfully written once all the
1941  * dependencies are removed. The ffs_fsync routine in conjunction with
1942  * softdep_sync_metadata work together to get all the dependencies
1943  * removed so that a file can be successfully written to disk. Three
1944  * procedures are used when setting up indirect block pointer
1945  * dependencies. The division is necessary because of the organization
1946  * of the "balloc" routine and because of the distinction between file
1947  * pages and file metadata blocks.
1948  */
1949
1950 /*
1951  * Allocate a new allocindir structure.
1952  */
1953 static struct allocindir *
1954 newallocindir(ip, ptrno, newblkno, oldblkno)
1955         struct inode *ip;       /* inode for file being extended */
1956         int ptrno;              /* offset of pointer in indirect block */
1957         ufs2_daddr_t newblkno;  /* disk block number being added */
1958         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
1959 {
1960         struct allocindir *aip;
1961
1962         MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1963                 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1964         workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
1965         aip->ai_state = ATTACHED;
1966         aip->ai_offset = ptrno;
1967         aip->ai_newblkno = newblkno;
1968         aip->ai_oldblkno = oldblkno;
1969         aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1970         return (aip);
1971 }
1972
1973 /*
1974  * Called just before setting an indirect block pointer
1975  * to a newly allocated file page.
1976  */
1977 void
1978 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1979         struct inode *ip;       /* inode for file being extended */
1980         ufs_lbn_t lbn;          /* allocated block number within file */
1981         struct buf *bp;         /* buffer with indirect blk referencing page */
1982         int ptrno;              /* offset of pointer in indirect block */
1983         ufs2_daddr_t newblkno;  /* disk block number being added */
1984         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
1985         struct buf *nbp;        /* buffer holding allocated page */
1986 {
1987         struct allocindir *aip;
1988         struct pagedep *pagedep;
1989
1990         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
1991         aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1992         ACQUIRE_LOCK(&lk);
1993         /*
1994          * If we are allocating a directory page, then we must
1995          * allocate an associated pagedep to track additions and
1996          * deletions.
1997          */
1998         if ((ip->i_mode & IFMT) == IFDIR &&
1999             pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
2000                 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
2001         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
2002         setup_allocindir_phase2(bp, ip, aip);
2003         FREE_LOCK(&lk);
2004 }
2005
2006 /*
2007  * Called just before setting an indirect block pointer to a
2008  * newly allocated indirect block.
2009  */
2010 void
2011 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
2012         struct buf *nbp;        /* newly allocated indirect block */
2013         struct inode *ip;       /* inode for file being extended */
2014         struct buf *bp;         /* indirect block referencing allocated block */
2015         int ptrno;              /* offset of pointer in indirect block */
2016         ufs2_daddr_t newblkno;  /* disk block number being added */
2017 {
2018         struct allocindir *aip;
2019
2020         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
2021         aip = newallocindir(ip, ptrno, newblkno, 0);
2022         ACQUIRE_LOCK(&lk);
2023         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
2024         setup_allocindir_phase2(bp, ip, aip);
2025         FREE_LOCK(&lk);
2026 }
2027
2028 /*
2029  * Called to finish the allocation of the "aip" allocated
2030  * by one of the two routines above.
      *
      * The allocindir is linked to the indirdep attached to the indirect
      * block's buffer "bp".  If the buffer has no indirdep yet, one is
      * created together with a saved copy of the indirect block
      * (ir_savebp), and the search is repeated.
      *
      * Locking: entered with lk held (asserted) and returns with lk held,
      * but lk is released and re-acquired inside the loop, both to call
      * handle_workitem_freefrag and to allocate a new indirdep.
      *
      * Panics if bp is not an indirect block buffer, if the newblk for
      * aip->ai_newblkno cannot be found, or if an existing allocindir at
      * the same offset does not match aip's old block number.
2031  */
2032 static void
2033 setup_allocindir_phase2(bp, ip, aip)
2034         struct buf *bp;         /* in-memory copy of the indirect block */
2035         struct inode *ip;       /* inode for file being extended */
2036         struct allocindir *aip; /* allocindir allocated by the above routines */
2037 {
2038         struct worklist *wk;
2039         struct indirdep *indirdep, *newindirdep;
2040         struct bmsafemap *bmsafemap;
2041         struct allocindir *oldaip;
2042         struct freefrag *freefrag;
2043         struct newblk *newblk;
2044         ufs2_daddr_t blkno;
2045
2046         mtx_assert(&lk, MA_OWNED);
             /* A non-negative logical block number is rejected as "not indir blk". */
2047         if (bp->b_lblkno >= 0)
2048                 panic("setup_allocindir_phase2: not indir blk");
             /*
              * Each pass rescans bp's dependency list, because lk is
              * dropped while a candidate indirdep is being allocated.
              * The loop ends once aip has been linked to an indirdep.
              */
2049         for (indirdep = NULL, newindirdep = NULL; ; ) {
2050                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2051                         if (wk->wk_type != D_INDIRDEP)
2052                                 continue;
2053                         indirdep = WK_INDIRDEP(wk);
2054                         break;
2055                 }
                     /*
                      * Nothing attached yet: adopt the indirdep that was
                      * prepared on the previous pass, if there is one.
                      */
2056                 if (indirdep == NULL && newindirdep) {
2057                         indirdep = newindirdep;
2058                         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
2059                         newindirdep = NULL;
2060                 }
2061                 if (indirdep) {
                             /*
                              * Move the bitmap dependency state from the
                              * newblk to aip, then discard the newblk.
                              */
2062                         if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
2063                             &newblk) == 0)
2064                                 panic("setup_allocindir: lost block");
2065                         if (newblk->nb_state == DEPCOMPLETE) {
2066                                 aip->ai_state |= DEPCOMPLETE;
2067                                 aip->ai_buf = NULL;
2068                         } else {
2069                                 bmsafemap = newblk->nb_bmsafemap;
2070                                 aip->ai_buf = bmsafemap->sm_buf;
2071                                 LIST_REMOVE(newblk, nb_deps);
2072                                 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
2073                                     aip, ai_deps);
2074                         }
2075                         LIST_REMOVE(newblk, nb_hash);
2076                         FREE(newblk, M_NEWBLK);
2077                         aip->ai_indirdep = indirdep;
2078                         /*
2079                          * Check to see if there is an existing dependency
2080                          * for this block. If there is, merge the old
2081                          * dependency into the new one.
                              * The search is skipped when aip has no old
                              * block; otherwise oldaip is left NULL by
                              * LIST_FOREACH when no offset matches.
2082                          */
2083                         if (aip->ai_oldblkno == 0)
2084                                 oldaip = NULL;
2085                         else
2086
2087                                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
2088                                         if (oldaip->ai_offset == aip->ai_offset)
2089                                                 break;
2090                         freefrag = NULL;
2091                         if (oldaip != NULL) {
2092                                 if (oldaip->ai_newblkno != aip->ai_oldblkno)
2093                                         panic("setup_allocindir_phase2: blkno");
                                     /*
                                      * aip inherits the old entry's original
                                      * block and freefrag; aip's own freefrag
                                      * is processed below once lk is released.
                                      */
2094                                 aip->ai_oldblkno = oldaip->ai_oldblkno;
2095                                 freefrag = aip->ai_freefrag;
2096                                 aip->ai_freefrag = oldaip->ai_freefrag;
2097                                 oldaip->ai_freefrag = NULL;
2098                                 free_allocindir(oldaip, NULL);
2099                         }
2100                         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
                             /*
                              * Store the old block number at this offset in
                              * the saved copy of the indirect block, using
                              * the pointer width of the file system format.
                              */
2101                         if (ip->i_ump->um_fstype == UFS1)
2102                                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
2103                                     [aip->ai_offset] = aip->ai_oldblkno;
2104                         else
2105                                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
2106                                     [aip->ai_offset] = aip->ai_oldblkno;
                             /*
                              * handle_workitem_freefrag acquires lk itself,
                              * so it is called with lk released.
                              */
2107                         FREE_LOCK(&lk);
2108                         if (freefrag != NULL)
2109                                 handle_workitem_freefrag(freefrag);
2110                 } else
2111                         FREE_LOCK(&lk);
                     /*
                      * A prepared indirdep that is still unclaimed here means
                      * the scan found an existing one; discard the spare and
                      * its saved buffer.  lk is re-taken to free the work item.
                      */
2112                 if (newindirdep) {
2113                         newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2114                         brelse(newindirdep->ir_savebp);
2115                         ACQUIRE_LOCK(&lk);
2116                         WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
2117                         if (indirdep)
2118                                 break;
2119                         FREE_LOCK(&lk);
2120                 }
                     /* Finished: re-take lk so that it is held on return. */
2121                 if (indirdep) {
2122                         ACQUIRE_LOCK(&lk);
2123                         break;
2124                 }
                     /*
                      * No indirdep exists yet.  Build a candidate with lk
                      * released; the next pass rescans before using it.
                      */
2125                 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
2126                         M_INDIRDEP, M_SOFTDEP_FLAGS);
2127                 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
2128                     UFSTOVFS(ip->i_ump));
2129                 newindirdep->ir_state = ATTACHED;
2130                 if (ip->i_ump->um_fstype == UFS1)
2131                         newindirdep->ir_state |= UFS1FMT;
2132                 LIST_INIT(&newindirdep->ir_deplisthd);
2133                 LIST_INIT(&newindirdep->ir_donehd);
                     /*
                      * NOTE(review): b_blkno == b_lblkno presumably means the
                      * buffer has not yet been mapped to a disk address, so
                      * the address is obtained via ufs_bmaparray -- confirm.
                      */
2134                 if (bp->b_blkno == bp->b_lblkno) {
2135                         ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
2136                             NULL, NULL);
2137                         bp->b_blkno = blkno;
2138                 }
                     /*
                      * Get a buffer on the device vnode at the same block
                      * address and copy the current indirect block into it.
                      */
2139                 newindirdep->ir_savebp =
2140                     getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
2141                 BUF_KERNPROC(newindirdep->ir_savebp);
2142                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
2143                 ACQUIRE_LOCK(&lk);
2144         }
2145 }
2146
2147 /*
2148  * Block de-allocation dependencies.
2149  *
2150  * When blocks are de-allocated, the on-disk pointers must be nullified before
2151  * the blocks are made available for use by other files.  (The true
2152  * requirement is that old pointers must be nullified before new on-disk
2153  * pointers are set.  We chose this slightly more stringent requirement to
2154  * reduce complexity.) Our implementation handles this dependency by updating
2155  * the inode (or indirect block) appropriately but delaying the actual block
2156  * de-allocation (i.e., freemap and free space count manipulation) until
2157  * after the updated versions reach stable storage.  After the disk is
2158  * updated, the blocks can be safely de-allocated whenever it is convenient.
2159  * This implementation handles only the common case of reducing a file's
2160  * length to zero. Other cases are handled by the conventional synchronous
2161  * write approach.
2162  *
2163  * The ffs implementation with which we worked double-checks
2164  * the state of the block pointers and file size as it reduces
2165  * a file's length.  Some of this code is replicated here in our
2166  * soft updates implementation.  The freeblks->fb_chkcnt field is
2167  * used to transfer a part of this information to the procedure
2168  * that eventually de-allocates the blocks.
2169  *
2170  * This routine should be called from the routine that shortens
2171  * a file's length, before the inode's size or block pointers
2172  * are modified. It will save the block pointer information for
2173  * later release and zero the inode so that the calling routine
2174  * can release it.
      *
      * Panics if length is non-zero, or if the inodedep is found in the
      * IOSTARTED state.  lk is acquired and released internally.
2175  */
2176 void
2177 softdep_setup_freeblocks(ip, length, flags)
2178         struct inode *ip;       /* The inode whose length is to be reduced */
2179         off_t length;           /* The new length for the file */
2180         int flags;              /* IO_EXT and/or IO_NORMAL */
2181 {
2182         struct freeblks *freeblks;
2183         struct inodedep *inodedep;
2184         struct allocdirect *adp;
2185         struct vnode *vp;
2186         struct buf *bp;
2187         struct fs *fs;
2188         ufs2_daddr_t extblocks, datablocks;
2189         struct mount *mp;
2190         int i, delay, error;
2191
2192         fs = ip->i_fs;
2193         mp = UFSTOVFS(ip->i_ump);
2194         if (length != 0)
2195                 panic("softdep_setup_freeblocks: non-zero length");
2196         MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
2197                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
2198         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
2199         freeblks->fb_state = ATTACHED;
2200         freeblks->fb_uid = ip->i_uid;
2201         freeblks->fb_previousinum = ip->i_number;
2202         freeblks->fb_devvp = ip->i_devvp;
             /*
              * Split the inode's block count into the part used by the
              * external attribute area (computed for UFS2 only) and the
              * remainder, which is attributed to regular data.
              */
2203         extblocks = 0;
2204         if (fs->fs_magic == FS_UFS2_MAGIC)
2205                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
2206         datablocks = DIP(ip, i_blocks) - extblocks;
2207         if ((flags & IO_NORMAL) == 0) {
2208                 freeblks->fb_oldsize = 0;
2209                 freeblks->fb_chkcnt = 0;
2210         } else {
                     /*
                      * Save the old size and the direct and indirect block
                      * pointers in the freeblks, clearing them in the inode.
                      */
2211                 freeblks->fb_oldsize = ip->i_size;
2212                 ip->i_size = 0;
2213                 DIP_SET(ip, i_size, 0);
2214                 freeblks->fb_chkcnt = datablocks;
2215                 for (i = 0; i < NDADDR; i++) {
2216                         freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
2217                         DIP_SET(ip, i_db[i], 0);
2218                 }
2219                 for (i = 0; i < NIADDR; i++) {
2220                         freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
2221                         DIP_SET(ip, i_ib[i], 0);
2222                 }
2223                 /*
2224                  * If the file was removed, then the space being freed was
2225                  * accounted for then (see softdep_releasefile()). If the
2226                  * file is merely being truncated, then we account for it now.
2227                  */
2228                 if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2229                         UFS_LOCK(ip->i_ump);
2230                         fs->fs_pendingblocks += datablocks;
2231                         UFS_UNLOCK(ip->i_ump);
2232                 }
2233         }
2234         if ((flags & IO_EXT) == 0) {
2235                 freeblks->fb_oldextsize = 0;
2236         } else {
                     /*
                      * Same treatment for the external attribute area: save
                      * the size and block pointers, then clear them.
                      */
2237                 freeblks->fb_oldextsize = ip->i_din2->di_extsize;
2238                 ip->i_din2->di_extsize = 0;
2239                 freeblks->fb_chkcnt += extblocks;
2240                 for (i = 0; i < NXADDR; i++) {
2241                         freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
2242                         ip->i_din2->di_extb[i] = 0;
2243                 }
2244         }
             /* fb_chkcnt is the total being released; deduct it from i_blocks. */
2245         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
2246         /*
2247          * Push the zero'ed inode to its disk buffer so that we are free
2248          * to delete its dependencies below. Once the dependencies are gone
2249          * the buffer can be safely released.
              *
              * NOTE(review): when bread fails, bp is released and the error
              * is reported, but execution continues and bp is dereferenced
              * and passed to bdwrite below -- confirm this path is safe.
2250          */
2251         if ((error = bread(ip->i_devvp,
2252             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
2253             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
2254                 brelse(bp);
2255                 softdep_error("softdep_setup_freeblocks", error);
2256         }
             /* Copy the in-memory dinode into its slot in the inode block. */
2257         if (ip->i_ump->um_fstype == UFS1)
2258                 *((struct ufs1_dinode *)bp->b_data +
2259                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
2260         else
2261                 *((struct ufs2_dinode *)bp->b_data +
2262                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
2263         /*
2264          * Find and eliminate any inode dependencies.
2265          */
2266         ACQUIRE_LOCK(&lk);
2267         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
2268         if ((inodedep->id_state & IOSTARTED) != 0)
2269                 panic("softdep_setup_freeblocks: inode busy");
2270         /*
2271          * Add the freeblks structure to the list of operations that
2272          * must await the zero'ed inode being written to disk. If we
2273          * still have a bitmap dependency (delay == 0), then the inode
2274          * has never been written to disk, so we can process the
2275          * freeblks below once we have deleted the dependencies.
2276          */
2277         delay = (inodedep->id_state & DEPCOMPLETE);
2278         if (delay)
2279                 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
2280         /*
2281          * Because the file length has been truncated to zero, any
2282          * pending block allocation dependency structures associated
2283          * with this inode are obsolete and can simply be de-allocated.
2284          * We must first merge the two dependency lists to get rid of
2285          * any duplicate freefrag structures, then purge the merged list.
2286          * If we still have a bitmap dependency, then the inode has never
2287          * been written to disk, so we can free any fragments without delay.
2288          */
2289         if (flags & IO_NORMAL) {
2290                 merge_inode_lists(&inodedep->id_newinoupdt,
2291                     &inodedep->id_inoupdt);
2292                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2293                         free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2294         }
2295         if (flags & IO_EXT) {
2296                 merge_inode_lists(&inodedep->id_newextupdt,
2297                     &inodedep->id_extupdt);
2298                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
2299                         free_allocdirect(&inodedep->id_extupdt, adp, delay);
2300         }
2301         FREE_LOCK(&lk);
2302         bdwrite(bp);
2303         /*
2304          * We must wait for any I/O in progress to finish so that
2305          * all potential buffers on the dirty list will be visible.
2306          * Once they are all there, walk the list and get rid of
2307          * any dependencies.
2308          */
2309         vp = ITOV(ip);
2310         VI_LOCK(vp);
2311         drain_output(vp);
2312 restart:
2313         TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
                     /*
                      * Skip buffers belonging to the area (external
                      * attributes vs. regular data) that is not being freed.
                      */
2314                 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
2315                     ((flags & IO_NORMAL) == 0 &&
2316                       (bp->b_xflags & BX_ALTDATA) == 0))
2317                         continue;
2318                 if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL)
2319                         goto restart;
2320                 VI_UNLOCK(vp);
2321                 ACQUIRE_LOCK(&lk);
2322                 (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
2323                 deallocate_dependencies(bp, inodedep);
2324                 FREE_LOCK(&lk);
2325                 bp->b_flags |= B_INVAL | B_NOCACHE;
2326                 brelse(bp);
2327                 VI_LOCK(vp);
                     /*
                      * The vnode interlock was dropped above, so the scan
                      * starts over from the head of the dirty list.
                      */
2328                 goto restart;
2329         }
2330         VI_UNLOCK(vp);
2331         ACQUIRE_LOCK(&lk);
             /* If an inodedep still exists for this inode, try to free it. */
2332         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
2333                 (void) free_inodedep(inodedep);
2334
2335         if(delay) {
2336                 freeblks->fb_state |= DEPCOMPLETE;
2337                 /*
2338                  * If the inode with zeroed block pointers is now on disk
2339                  * we can start freeing blocks. Add freeblks to the worklist
2340                  * instead of calling  handle_workitem_freeblocks directly as
2341                  * it is more likely that additional IO is needed to complete
2342                  * the request here than in the !delay case.
2343                  */
2344                 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
2345                         add_to_worklist(&freeblks->fb_list);
2346         }
2347
2348         FREE_LOCK(&lk);
2349         /*
2350          * If the inode has never been written to disk (delay == 0),
2351          * then we can process the freeblks now that we have deleted
2352          * the dependencies.
2353          */
2354         if (!delay)
2355                 handle_workitem_freeblocks(freeblks, 0);
2356 }
2357
2358 /*
2359  * Reclaim any dependency structures from a buffer that is about to
2360  * be reallocated to a new vnode. The buffer must be locked, thus,
2361  * no I/O completion operations can occur while we are manipulating
2362  * its associated dependencies. The mutex is held so that other I/O's
2363  * associated with related dependencies do not occur.
2364  */
2365 static void
2366 deallocate_dependencies(bp, inodedep)
2367         struct buf *bp;
2368         struct inodedep *inodedep;
2369 {
2370         struct worklist *wk;
2371         struct indirdep *indirdep;
2372         struct allocindir *aip;
2373         struct pagedep *pagedep;
2374         struct dirrem *dirrem;
2375         struct diradd *dap;
2376         int i;
2377
2378         mtx_assert(&lk, MA_OWNED);
2379         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2380                 switch (wk->wk_type) {
2381
2382                 case D_INDIRDEP:
2383                         indirdep = WK_INDIRDEP(wk);
2384                         /*
2385                          * None of the indirect pointers will ever be visible,
2386                          * so they can simply be tossed. GOINGAWAY ensures
2387                          * that allocated pointers will be saved in the buffer
2388                          * cache until they are freed. Note that they will
2389                          * only be able to be found by their physical address
2390                          * since the inode mapping the logical address will
2391                          * be gone. The save buffer used for the safe copy
2392                          * was allocated in setup_allocindir_phase2 using
2393                          * the physical address so it could be used for this
2394                          * purpose. Hence we swap the safe copy with the real
2395                          * copy, allowing the safe copy to be freed and holding
2396                          * on to the real copy for later use in indir_trunc.
2397                          */
2398                         if (indirdep->ir_state & GOINGAWAY)
2399                                 panic("deallocate_dependencies: already gone");
2400                         indirdep->ir_state |= GOINGAWAY;
2401                         VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
2402                         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2403                                 free_allocindir(aip, inodedep);
2404                         if (bp->b_lblkno >= 0 ||
2405                             bp->b_blkno != indirdep->ir_savebp->b_lblkno)
2406                                 panic("deallocate_dependencies: not indir");
2407                         bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2408                             bp->b_bcount);
2409                         WORKLIST_REMOVE(wk);
2410                         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2411                         continue;
2412
2413                 case D_PAGEDEP:
2414                         pagedep = WK_PAGEDEP(wk);
2415                         /*
2416                          * None of the directory additions will ever be
2417                          * visible, so they can simply be tossed.
2418                          */
2419                         for (i = 0; i < DAHASHSZ; i++)
2420                                 while ((dap =
2421                                     LIST_FIRST(&pagedep->pd_diraddhd[i])))
2422                                         free_diradd(dap);
2423                         while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2424                                 free_diradd(dap);
2425                         /*
2426                          * Copy any directory remove dependencies to the list
2427                          * to be processed after the zero'ed inode is written.
2428                          * If the inode has already been written, then they
2429                          * can be dumped directly onto the work list.
2430                          */
2431                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
2432                                 LIST_REMOVE(dirrem, dm_next);
2433                                 dirrem->dm_dirinum = pagedep->pd_ino;
2434                                 if (inodedep == NULL ||
2435                                     (inodedep->id_state & ALLCOMPLETE) ==
2436                                      ALLCOMPLETE)
2437                                         add_to_worklist(&dirrem->dm_list);
2438                                 else
2439                                         WORKLIST_INSERT(&inodedep->id_bufwait,
2440                                             &dirrem->dm_list);
2441                         }
2442                         if ((pagedep->pd_state & NEWBLOCK) != 0) {
2443                                 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2444                                         if (wk->wk_type == D_NEWDIRBLK &&
2445                                             WK_NEWDIRBLK(wk)->db_pagedep ==
2446                                               pagedep)
2447                                                 break;
2448                                 if (wk != NULL) {
2449                                         WORKLIST_REMOVE(wk);
2450                                         free_newdirblk(WK_NEWDIRBLK(wk));
2451                                 } else
2452                                         panic("deallocate_dependencies: "
2453                                               "lost pagedep");
2454                         }
2455                         WORKLIST_REMOVE(&pagedep->pd_list);
2456                         LIST_REMOVE(pagedep, pd_hash);
2457                         WORKITEM_FREE(pagedep, D_PAGEDEP);
2458                         continue;
2459
2460                 case D_ALLOCINDIR:
2461                         free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2462                         continue;
2463
2464                 case D_ALLOCDIRECT:
2465                 case D_INODEDEP:
2466                         panic("deallocate_dependencies: Unexpected type %s",
2467                             TYPENAME(wk->wk_type));
2468                         /* NOTREACHED */
2469
2470                 default:
2471                         panic("deallocate_dependencies: Unknown type %s",
2472                             TYPENAME(wk->wk_type));
2473                         /* NOTREACHED */
2474                 }
2475         }
2476 }
2477
2478 /*
2479  * Free an allocdirect. Generate a new freefrag work request if appropriate.
2480  * This routine must be called with splbio interrupts blocked.
2481  */
2482 static void
2483 free_allocdirect(adphead, adp, delay)
2484         struct allocdirectlst *adphead;
2485         struct allocdirect *adp;
2486         int delay;
2487 {
2488         struct newdirblk *newdirblk;
2489         struct worklist *wk;
2490
2491         mtx_assert(&lk, MA_OWNED);
2492         if ((adp->ad_state & DEPCOMPLETE) == 0)
2493                 LIST_REMOVE(adp, ad_deps);
2494         TAILQ_REMOVE(adphead, adp, ad_next);
2495         if ((adp->ad_state & COMPLETE) == 0)
2496                 WORKLIST_REMOVE(&adp->ad_list);
2497         if (adp->ad_freefrag != NULL) {
2498                 if (delay)
2499                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2500                             &adp->ad_freefrag->ff_list);
2501                 else
2502                         add_to_worklist(&adp->ad_freefrag->ff_list);
2503         }
2504         if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2505                 newdirblk = WK_NEWDIRBLK(wk);
2506                 WORKLIST_REMOVE(&newdirblk->db_list);
2507                 if (!LIST_EMPTY(&adp->ad_newdirblk))
2508                         panic("free_allocdirect: extra newdirblk");
2509                 if (delay)
2510                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2511                             &newdirblk->db_list);
2512                 else
2513                         free_newdirblk(newdirblk);
2514         }
2515         WORKITEM_FREE(adp, D_ALLOCDIRECT);
2516 }
2517
2518 /*
2519  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2520  * This routine must be called with splbio interrupts blocked.
2521  */
2522 static void
2523 free_newdirblk(newdirblk)
2524         struct newdirblk *newdirblk;
2525 {
2526         struct pagedep *pagedep;
2527         struct diradd *dap;
2528         int i;
2529
2530         mtx_assert(&lk, MA_OWNED);
2531         /*
2532          * If the pagedep is still linked onto the directory buffer
2533          * dependency chain, then some of the entries on the
2534          * pd_pendinghd list may not be committed to disk yet. In
2535          * this case, we will simply clear the NEWBLOCK flag and
2536          * let the pd_pendinghd list be processed when the pagedep
2537          * is next written. If the pagedep is no longer on the buffer
2538          * dependency chain, then all the entries on the pd_pending
2539          * list are committed to disk and we can free them here.
2540          */
2541         pagedep = newdirblk->db_pagedep;
2542         pagedep->pd_state &= ~NEWBLOCK;
2543         if ((pagedep->pd_state & ONWORKLIST) == 0)
2544                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2545                         free_diradd(dap);
2546         /*
2547          * If no dependencies remain, the pagedep will be freed.
2548          */
2549         for (i = 0; i < DAHASHSZ; i++)
2550                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
2551                         break;
2552         if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2553                 LIST_REMOVE(pagedep, pd_hash);
2554                 WORKITEM_FREE(pagedep, D_PAGEDEP);
2555         }
2556         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2557 }
2558
2559 /*
2560  * Prepare an inode to be freed. The actual free operation is not
2561  * done until the zero'ed inode has been written to disk.
2562  */
2563 void
2564 softdep_freefile(pvp, ino, mode)
2565         struct vnode *pvp;
2566         ino_t ino;
2567         int mode;
2568 {
2569         struct inode *ip = VTOI(pvp);
2570         struct inodedep *inodedep;
2571         struct freefile *freefile;
2572
2573         /*
2574          * This sets up the inode de-allocation dependency.
2575          */
2576         MALLOC(freefile, struct freefile *, sizeof(struct freefile),
2577                 M_FREEFILE, M_SOFTDEP_FLAGS);
2578         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
2579         freefile->fx_mode = mode;
2580         freefile->fx_oldinum = ino;
2581         freefile->fx_devvp = ip->i_devvp;
2582         if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2583                 UFS_LOCK(ip->i_ump);
2584                 ip->i_fs->fs_pendinginodes += 1;
2585                 UFS_UNLOCK(ip->i_ump);
2586         }
2587
2588         /*
2589          * If the inodedep does not exist, then the zero'ed inode has
2590          * been written to disk. If the allocated inode has never been
2591          * written to disk, then the on-disk inode is zero'ed. In either
2592          * case we can free the file immediately.
2593          */
2594         ACQUIRE_LOCK(&lk);
2595         if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
2596             check_inode_unwritten(inodedep)) {
2597                 FREE_LOCK(&lk);
2598                 handle_workitem_freefile(freefile);
2599                 return;
2600         }
2601         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2602         FREE_LOCK(&lk);
2603         ip->i_flag |= IN_MODIFIED;
2604 }
2605
2606 /*
2607  * Check to see if an inode has never been written to disk. If
2608  * so free the inodedep and return success, otherwise return failure.
2609  * This routine must be called with splbio interrupts blocked.
2610  *
2611  * If we still have a bitmap dependency, then the inode has never
2612  * been written to disk. Drop the dependency as it is no longer
2613  * necessary since the inode is being deallocated. We set the
2614  * ALLCOMPLETE flags since the bitmap now properly shows that the
2615  * inode is not allocated. Even if the inode is actively being
2616  * written, it has been rolled back to its zero'ed state, so we
2617  * are ensured that a zero inode is what is on the disk. For short
2618  * lived files, this change will usually result in removing all the
2619  * dependencies from the inode so that it can be freed immediately.
2620  */
2621 static int
2622 check_inode_unwritten(inodedep)
2623         struct inodedep *inodedep;
2624 {
2625
2626         mtx_assert(&lk, MA_OWNED);
2627         if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2628             !LIST_EMPTY(&inodedep->id_pendinghd) ||
2629             !LIST_EMPTY(&inodedep->id_bufwait) ||
2630             !LIST_EMPTY(&inodedep->id_inowait) ||
2631             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
2632             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
2633             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
2634             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
2635             inodedep->id_nlinkdelta != 0)
2636                 return (0);
2637
2638         /*
2639          * Another process might be in initiate_write_inodeblock_ufs[12]
2640          * trying to allocate memory without holding "Softdep Lock".
2641          */
2642         if ((inodedep->id_state & IOSTARTED) != 0 &&
2643             inodedep->id_savedino1 == NULL)
2644                 return (0);
2645
2646         inodedep->id_state |= ALLCOMPLETE;
2647         LIST_REMOVE(inodedep, id_deps);
2648         inodedep->id_buf = NULL;
2649         if (inodedep->id_state & ONWORKLIST)
2650                 WORKLIST_REMOVE(&inodedep->id_list);
2651         if (inodedep->id_savedino1 != NULL) {
2652                 FREE(inodedep->id_savedino1, M_SAVEDINO);
2653                 inodedep->id_savedino1 = NULL;
2654         }
2655         if (free_inodedep(inodedep) == 0)
2656                 panic("check_inode_unwritten: busy inode");
2657         return (1);
2658 }
2659
2660 /*
2661  * Try to free an inodedep structure. Return 1 if it could be freed.
2662  */
2663 static int
2664 free_inodedep(inodedep)
2665         struct inodedep *inodedep;
2666 {
2667
2668         mtx_assert(&lk, MA_OWNED);
2669         if ((inodedep->id_state & ONWORKLIST) != 0 ||
2670             (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2671             !LIST_EMPTY(&inodedep->id_pendinghd) ||
2672             !LIST_EMPTY(&inodedep->id_bufwait) ||
2673             !LIST_EMPTY(&inodedep->id_inowait) ||
2674             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
2675             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
2676             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
2677             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
2678             inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2679                 return (0);
2680         LIST_REMOVE(inodedep, id_hash);
2681         WORKITEM_FREE(inodedep, D_INODEDEP);
2682         num_inodedep -= 1;
2683         return (1);
2684 }
2685
2686 /*
2687  * This workitem routine performs the block de-allocation.
2688  * The workitem is added to the pending list after the updated
2689  * inode block has been written to disk.  As mentioned above,
2690  * checks regarding the number of blocks de-allocated (compared
2691  * to the number of blocks allocated for the file) are also
2692  * performed in this function.
2693  */
2694 static void
2695 handle_workitem_freeblocks(freeblks, flags)
2696         struct freeblks *freeblks;
2697         int flags;
2698 {
2699         struct inode *ip;
2700         struct vnode *vp;
2701         struct fs *fs;
2702         struct ufsmount *ump;
2703         int i, nblocks, level, bsize;
2704         ufs2_daddr_t bn, blocksreleased = 0;
2705         int error, allerror = 0;
2706         ufs_lbn_t baselbns[NIADDR], tmpval;
2707         int fs_pendingblocks;
2708
2709         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
2710         fs = ump->um_fs;
2711         fs_pendingblocks = 0;
2712         tmpval = 1;
2713         baselbns[0] = NDADDR;
2714         for (i = 1; i < NIADDR; i++) {
2715                 tmpval *= NINDIR(fs);
2716                 baselbns[i] = baselbns[i - 1] + tmpval;
2717         }
2718         nblocks = btodb(fs->fs_bsize);
2719         blocksreleased = 0;
2720         /*
2721          * Release all extended attribute blocks or frags.
2722          */
2723         if (freeblks->fb_oldextsize > 0) {
2724                 for (i = (NXADDR - 1); i >= 0; i--) {
2725                         if ((bn = freeblks->fb_eblks[i]) == 0)
2726                                 continue;
2727                         bsize = sblksize(fs, freeblks->fb_oldextsize, i);
2728                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2729                             freeblks->fb_previousinum);
2730                         blocksreleased += btodb(bsize);
2731                 }
2732         }
2733         /*
2734          * Release all data blocks or frags.
2735          */
2736         if (freeblks->fb_oldsize > 0) {
2737                 /*
2738                  * Indirect blocks first.
2739                  */
2740                 for (level = (NIADDR - 1); level >= 0; level--) {
2741                         if ((bn = freeblks->fb_iblks[level]) == 0)
2742                                 continue;
2743                         if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
2744                             level, baselbns[level], &blocksreleased)) != 0)
2745                                 allerror = error;
2746                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
2747                             fs->fs_bsize, freeblks->fb_previousinum);
2748                         fs_pendingblocks += nblocks;
2749                         blocksreleased += nblocks;
2750                 }
2751                 /*
2752                  * All direct blocks or frags.
2753                  */
2754                 for (i = (NDADDR - 1); i >= 0; i--) {
2755                         if ((bn = freeblks->fb_dblks[i]) == 0)
2756                                 continue;
2757                         bsize = sblksize(fs, freeblks->fb_oldsize, i);
2758                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2759                             freeblks->fb_previousinum);
2760                         fs_pendingblocks += btodb(bsize);
2761                         blocksreleased += btodb(bsize);
2762                 }
2763         }
2764         UFS_LOCK(ump);
2765         fs->fs_pendingblocks -= fs_pendingblocks;
2766         UFS_UNLOCK(ump);
2767         /*
2768          * If we still have not finished background cleanup, then check
2769          * to see if the block count needs to be adjusted.
2770          */
2771         if (freeblks->fb_chkcnt != blocksreleased &&
2772             (fs->fs_flags & FS_UNCLEAN) != 0 &&
2773             ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
2774             (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
2775                 ip = VTOI(vp);
2776                 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
2777                     freeblks->fb_chkcnt - blocksreleased);
2778                 ip->i_flag |= IN_CHANGE;
2779                 vput(vp);
2780         }
2781
2782 #ifdef INVARIANTS
2783         if (freeblks->fb_chkcnt != blocksreleased &&
2784             ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2785                 printf("handle_workitem_freeblocks: block count\n");
2786         if (allerror)
2787                 softdep_error("handle_workitem_freeblks", allerror);
2788 #endif /* INVARIANTS */
2789
2790         ACQUIRE_LOCK(&lk);
2791         WORKITEM_FREE(freeblks, D_FREEBLKS);
2792         FREE_LOCK(&lk);
2793 }
2794
2795 /*
2796  * Release blocks associated with the inode ip and stored in the indirect
2797  * block dbn. If level is greater than SINGLE, the block is an indirect block
2798  * and recursive calls to indirtrunc must be used to cleanse other indirect
2799  * blocks.
2800  */
2801 static int
2802 indir_trunc(freeblks, dbn, level, lbn, countp)
2803         struct freeblks *freeblks;
2804         ufs2_daddr_t dbn;
2805         int level;
2806         ufs_lbn_t lbn;
2807         ufs2_daddr_t *countp;
2808 {
2809         struct buf *bp;
2810         struct fs *fs;
2811         struct worklist *wk;
2812         struct indirdep *indirdep;
2813         struct ufsmount *ump;
2814         ufs1_daddr_t *bap1 = 0;
2815         ufs2_daddr_t nb, *bap2 = 0;
2816         ufs_lbn_t lbnadd;
2817         int i, nblocks, ufs1fmt;
2818         int error, allerror = 0;
2819         int fs_pendingblocks;
2820
2821         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
2822         fs = ump->um_fs;
2823         fs_pendingblocks = 0;
2824         lbnadd = 1;
2825         for (i = level; i > 0; i--)
2826                 lbnadd *= NINDIR(fs);
2827         /*
2828          * Get buffer of block pointers to be freed. This routine is not
2829          * called until the zero'ed inode has been written, so it is safe
2830          * to free blocks as they are encountered. Because the inode has
2831          * been zero'ed, calls to bmap on these blocks will fail. So, we
2832          * have to use the on-disk address and the block device for the
2833          * filesystem to look them up. If the file was deleted before its
2834          * indirect blocks were all written to disk, the routine that set
2835          * us up (deallocate_dependencies) will have arranged to leave
2836          * a complete copy of the indirect block in memory for our use.
2837          * Otherwise we have to read the blocks in from the disk.
2838          */
2839 #ifdef notyet
2840         bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
2841             GB_NOCREAT);
2842 #else
2843         bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
2844 #endif
2845         ACQUIRE_LOCK(&lk);
2846         if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2847                 if (wk->wk_type != D_INDIRDEP ||
2848                     (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2849                     (indirdep->ir_state & GOINGAWAY) == 0)
2850                         panic("indir_trunc: lost indirdep");
2851                 WORKLIST_REMOVE(wk);
2852                 WORKITEM_FREE(indirdep, D_INDIRDEP);
2853                 if (!LIST_EMPTY(&bp->b_dep))
2854                         panic("indir_trunc: dangling dep");
2855                 ump->um_numindirdeps -= 1;
2856                 FREE_LOCK(&lk);
2857         } else {
2858 #ifdef notyet
2859                 if (bp)
2860                         brelse(bp);
2861 #endif
2862                 FREE_LOCK(&lk);
2863                 error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
2864                     NOCRED, &bp);
2865                 if (error) {
2866                         brelse(bp);
2867                         return (error);
2868                 }
2869         }
2870         /*
2871          * Recursively free indirect blocks.
2872          */
2873         if (ump->um_fstype == UFS1) {
2874                 ufs1fmt = 1;
2875                 bap1 = (ufs1_daddr_t *)bp->b_data;
2876         } else {
2877                 ufs1fmt = 0;
2878                 bap2 = (ufs2_daddr_t *)bp->b_data;
2879         }
2880         nblocks = btodb(fs->fs_bsize);
2881         for (i = NINDIR(fs) - 1; i >= 0; i--) {
2882                 if (ufs1fmt)
2883                         nb = bap1[i];
2884                 else
2885                         nb = bap2[i];
2886                 if (nb == 0)
2887                         continue;
2888                 if (level != 0) {
2889                         if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
2890                              level - 1, lbn + (i * lbnadd), countp)) != 0)
2891                                 allerror = error;
2892                 }
2893                 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
2894                     freeblks->fb_previousinum);
2895                 fs_pendingblocks += nblocks;
2896                 *countp += nblocks;
2897         }
2898         UFS_LOCK(ump);
2899         fs->fs_pendingblocks -= fs_pendingblocks;
2900         UFS_UNLOCK(ump);
2901         bp->b_flags |= B_INVAL | B_NOCACHE;
2902         brelse(bp);
2903         return (allerror);
2904 }
2905
2906 /*
2907  * Free an allocindir.
2908  * This routine must be called with splbio interrupts blocked.
2909  */
2910 static void
2911 free_allocindir(aip, inodedep)
2912         struct allocindir *aip;
2913         struct inodedep *inodedep;
2914 {
2915         struct freefrag *freefrag;
2916
2917         mtx_assert(&lk, MA_OWNED);
2918         if ((aip->ai_state & DEPCOMPLETE) == 0)
2919                 LIST_REMOVE(aip, ai_deps);
2920         if (aip->ai_state & ONWORKLIST)
2921                 WORKLIST_REMOVE(&aip->ai_list);
2922         LIST_REMOVE(aip, ai_next);
2923         if ((freefrag = aip->ai_freefrag) != NULL) {
2924                 if (inodedep == NULL)
2925                         add_to_worklist(&freefrag->ff_list);
2926                 else
2927                         WORKLIST_INSERT(&inodedep->id_bufwait,
2928                             &freefrag->ff_list);
2929         }
2930         WORKITEM_FREE(aip, D_ALLOCINDIR);
2931 }
2932
2933 /*
2934  * Directory entry addition dependencies.
2935  *
2936  * When adding a new directory entry, the inode (with its incremented link
2937  * count) must be written to disk before the directory entry's pointer to it.
2938  * Also, if the inode is newly allocated, the corresponding freemap must be
2939  * updated (on disk) before the directory entry's pointer. These requirements
2940  * are met via undo/redo on the directory entry's pointer, which consists
2941  * simply of the inode number.
2942  *
2943  * As directory entries are added and deleted, the free space within a
2944  * directory block can become fragmented.  The ufs filesystem will compact
2945  * a fragmented directory block to make space for a new entry. When this
2946  * occurs, the offsets of previously added entries change. Any "diradd"
2947  * dependency structures corresponding to these entries must be updated with
2948  * the new offsets.
2949  */
2950
2951 /*
2952  * This routine is called after the in-memory inode's link
2953  * count has been incremented, but before the directory entry's
2954  * pointer to the inode has been set.
2955  */
2956 int
2957 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2958         struct buf *bp;         /* buffer containing directory block */
2959         struct inode *dp;       /* inode for directory */
2960         off_t diroffset;        /* offset of new entry in directory */
2961         ino_t newinum;          /* inode referenced by new directory entry */
2962         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
2963         int isnewblk;           /* entry is in a newly allocated block */
2964 {
2965         int offset;             /* offset of new entry within directory block */
2966         ufs_lbn_t lbn;          /* block in directory containing new entry */
2967         struct fs *fs;
2968         struct diradd *dap;
2969         struct allocdirect *adp;
2970         struct pagedep *pagedep;
2971         struct inodedep *inodedep;
2972         struct newdirblk *newdirblk = 0;
2973         struct mkdir *mkdir1, *mkdir2;
2974         struct mount *mp;
2975
2976         /*
2977          * Whiteouts have no dependencies.
2978          */
2979         if (newinum == WINO) {
2980                 if (newdirbp != NULL)
2981                         bdwrite(newdirbp);
2982                 return (0);
2983         }
2984         mp = UFSTOVFS(dp->i_ump);
2985         fs = dp->i_fs;
2986         lbn = lblkno(fs, diroffset);
2987         offset = blkoff(fs, diroffset);
2988         MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2989                 M_SOFTDEP_FLAGS|M_ZERO);
2990         workitem_alloc(&dap->da_list, D_DIRADD, mp);
2991         dap->da_offset = offset;
2992         dap->da_newinum = newinum;
2993         dap->da_state = ATTACHED;
2994         if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2995                 MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
2996                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
2997                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
2998         }
2999         if (newdirbp == NULL) {
3000                 dap->da_state |= DEPCOMPLETE;
3001                 ACQUIRE_LOCK(&lk);
3002         } else {
3003                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
3004                 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
3005                     M_SOFTDEP_FLAGS);
3006                 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
3007                 mkdir1->md_state = MKDIR_BODY;
3008                 mkdir1->md_diradd = dap;
3009                 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
3010                     M_SOFTDEP_FLAGS);
3011                 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
3012                 mkdir2->md_state = MKDIR_PARENT;
3013                 mkdir2->md_diradd = dap;
3014                 /*
3015                  * Dependency on "." and ".." being written to disk.
3016                  */
3017                 mkdir1->md_buf = newdirbp;
3018                 ACQUIRE_LOCK(&lk);
3019                 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
3020                 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
3021                 FREE_LOCK(&lk);
3022                 bdwrite(newdirbp);
3023                 /*
3024                  * Dependency on link count increase for parent directory
3025                  */
3026                 ACQUIRE_LOCK(&lk);
3027                 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
3028                     || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3029                         dap->da_state &= ~MKDIR_PARENT;
3030                         WORKITEM_FREE(mkdir2, D_MKDIR);
3031                 } else {
3032                         LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
3033                         WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
3034                 }
3035         }
3036         /*
3037          * Link into parent directory pagedep to await its being written.
3038          */
3039         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3040                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3041         dap->da_pagedep = pagedep;
3042         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
3043             da_pdlist);
3044         /*
3045          * Link into its inodedep. Put it on the id_bufwait list if the inode
3046          * is not yet written. If it is written, do the post-inode write
3047          * processing to put it on the id_pendinghd list.
3048          */
3049         (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
3050         if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
3051                 diradd_inode_written(dap, inodedep);
3052         else
3053                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3054         if (isnewblk) {
3055                 /*
3056                  * Directories growing into indirect blocks are rare
3057                  * enough and the frequency of new block allocation
3058                  * in those cases even more rare, that we choose not
3059                  * to bother tracking them. Rather we simply force the
3060                  * new directory entry to disk.
3061                  */
3062                 if (lbn >= NDADDR) {
3063                         FREE_LOCK(&lk);
3064                         /*
3065                          * We only have a new allocation when at the
3066                          * beginning of a new block, not when we are
3067                          * expanding into an existing block.
3068                          */
3069                         if (blkoff(fs, diroffset) == 0)
3070                                 return (1);
3071                         return (0);
3072                 }
3073                 /*
3074                  * We only have a new allocation when at the beginning
3075                  * of a new fragment, not when we are expanding into an
3076                  * existing fragment. Also, there is nothing to do if we
3077                  * are already tracking this block.
3078                  */
3079                 if (fragoff(fs, diroffset) != 0) {
3080                         FREE_LOCK(&lk);
3081                         return (0);
3082                 }
3083                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
3084                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
3085                         FREE_LOCK(&lk);
3086                         return (0);
3087                 }
3088                 /*
3089                  * Find our associated allocdirect and have it track us.
3090                  */
3091                 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
3092                         panic("softdep_setup_directory_add: lost inodedep");
3093                 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
3094                 if (adp == NULL || adp->ad_lbn != lbn)
3095                         panic("softdep_setup_directory_add: lost entry");
3096                 pagedep->pd_state |= NEWBLOCK;
3097                 newdirblk->db_pagedep = pagedep;
3098                 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
3099         }
3100         FREE_LOCK(&lk);
3101         return (0);
3102 }
3103
3104 /*
3105  * This procedure is called to change the offset of a directory
3106  * entry when compacting a directory block which must be owned
3107  * exclusively by the caller. Note that the actual entry movement
3108  * must be done in this procedure to ensure that no I/O completions
3109  * occur while the move is in progress.
3110  */
3111 void
3112 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
3113         struct inode *dp;       /* inode for directory */
3114         caddr_t base;           /* address of dp->i_offset */
3115         caddr_t oldloc;         /* address of old directory location */
3116         caddr_t newloc;         /* address of new directory location */
3117         int entrysize;          /* size of directory entry */
3118 {
3119         int offset, oldoffset, newoffset;
3120         struct pagedep *pagedep;
3121         struct diradd *dap;
3122         ufs_lbn_t lbn;
3123
3124         ACQUIRE_LOCK(&lk);
3125         lbn = lblkno(dp->i_fs, dp->i_offset);
3126         offset = blkoff(dp->i_fs, dp->i_offset);
3127         if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
3128                 goto done;
3129         oldoffset = offset + (oldloc - base);
3130         newoffset = offset + (newloc - base);
3131
3132         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
3133                 if (dap->da_offset != oldoffset)
3134                         continue;
3135                 dap->da_offset = newoffset;
3136                 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
3137                         break;
3138                 LIST_REMOVE(dap, da_pdlist);
3139                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
3140                     dap, da_pdlist);
3141                 break;
3142         }
3143         if (dap == NULL) {
3144
3145                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
3146                         if (dap->da_offset == oldoffset) {
3147                                 dap->da_offset = newoffset;
3148                                 break;
3149                         }
3150                 }
3151         }
3152 done:
3153         bcopy(oldloc, newloc, entrysize);
3154         FREE_LOCK(&lk);
3155 }
3156
3157 /*
3158  * Free a diradd dependency structure. This routine must be called
3159  * with splbio interrupts blocked.
3160  */
3161 static void
3162 free_diradd(dap)
3163         struct diradd *dap;
3164 {
3165         struct dirrem *dirrem;
3166         struct pagedep *pagedep;
3167         struct inodedep *inodedep;
3168         struct mkdir *mkdir, *nextmd;
3169
3170         mtx_assert(&lk, MA_OWNED);
3171         WORKLIST_REMOVE(&dap->da_list);
3172         LIST_REMOVE(dap, da_pdlist);
3173         if ((dap->da_state & DIRCHG) == 0) {
3174                 pagedep = dap->da_pagedep;
3175         } else {
3176                 dirrem = dap->da_previous;
3177                 pagedep = dirrem->dm_pagedep;
3178                 dirrem->dm_dirinum = pagedep->pd_ino;
3179                 add_to_worklist(&dirrem->dm_list);
3180         }
3181         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
3182             0, &inodedep) != 0)
3183                 (void) free_inodedep(inodedep);
3184         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
3185                 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
3186                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
3187                         if (mkdir->md_diradd != dap)
3188                                 continue;
3189                         dap->da_state &= ~mkdir->md_state;
3190                         WORKLIST_REMOVE(&mkdir->md_list);
3191                         LIST_REMOVE(mkdir, md_mkdirs);
3192                         WORKITEM_FREE(mkdir, D_MKDIR);
3193                 }
3194                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
3195                         panic("free_diradd: unfound ref");
3196         }
3197         WORKITEM_FREE(dap, D_DIRADD);
3198 }
3199
3200 /*
3201  * Directory entry removal dependencies.
3202  *
3203  * When removing a directory entry, the entry's inode pointer must be
3204  * zero'ed on disk before the corresponding inode's link count is decremented
3205  * (possibly freeing the inode for re-use). This dependency is handled by
3206  * updating the directory entry but delaying the inode count reduction until
3207  * after the directory block has been written to disk. After this point, the
3208  * inode count can be decremented whenever it is convenient.
3209  */
3210
3211 /*
3212  * This routine should be called immediately after removing
3213  * a directory entry.  The inode's link count should not be
3214  * decremented by the calling procedure -- the soft updates
3215  * code will do this task when it is safe.
3216  */
3217 void
3218 softdep_setup_remove(bp, dp, ip, isrmdir)
3219         struct buf *bp;         /* buffer containing directory block */
3220         struct inode *dp;       /* inode for the directory being modified */
3221         struct inode *ip;       /* inode for directory entry being removed */
3222         int isrmdir;            /* indicates if doing RMDIR */
3223 {
3224         struct dirrem *dirrem, *prevdirrem;
3225
3226         /*
3227          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
3228          */
3229         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3230
3231         /*
3232          * If the COMPLETE flag is clear, then there were no active
3233          * entries and we want to roll back to a zeroed entry until
3234          * the new inode is committed to disk. If the COMPLETE flag is
3235          * set then we have deleted an entry that never made it to
3236          * disk. If the entry we deleted resulted from a name change,
3237          * then the old name still resides on disk. We cannot delete
3238          * its inode (returned to us in prevdirrem) until the zeroed
3239          * directory entry gets to disk. The new inode has never been
3240          * referenced on the disk, so can be deleted immediately.
3241          */
3242         if ((dirrem->dm_state & COMPLETE) == 0) {
3243                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
3244                     dm_next);
3245                 FREE_LOCK(&lk);
3246         } else {
3247                 if (prevdirrem != NULL)
3248                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
3249                             prevdirrem, dm_next);
3250                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
3251                 FREE_LOCK(&lk);
3252                 handle_workitem_remove(dirrem, NULL);
3253         }
3254 }
3255
3256 /*
3257  * Allocate a new dirrem if appropriate and return it along with
3258  * its associated pagedep. Called without a lock, returns with lock.
3259  */
3260 static long num_dirrem;         /* number of dirrem allocated */
3261 static struct dirrem *
3262 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
3263         struct buf *bp;         /* buffer containing directory block */
3264         struct inode *dp;       /* inode for the directory being modified */
3265         struct inode *ip;       /* inode for directory entry being removed */
3266         int isrmdir;            /* indicates if doing RMDIR */
3267         struct dirrem **prevdirremp; /* previously referenced inode, if any */
3268 {
3269         int offset;
3270         ufs_lbn_t lbn;
3271         struct diradd *dap;
3272         struct dirrem *dirrem;
3273         struct pagedep *pagedep;
3274
3275         /*
3276          * Whiteouts have no deletion dependencies.
3277          */
3278         if (ip == NULL)
3279                 panic("newdirrem: whiteout");
3280         /*
3281          * If we are over our limit, try to improve the situation.
3282          * Limiting the number of dirrem structures will also limit
3283          * the number of freefile and freeblks structures.
3284          */
3285         ACQUIRE_LOCK(&lk);
3286         if (num_dirrem > max_softdeps / 2)
3287                 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
3288         num_dirrem += 1;
3289         FREE_LOCK(&lk);
3290         MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
3291                 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
3292         workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
3293         dirrem->dm_state = isrmdir ? RMDIR : 0;
3294         dirrem->dm_oldinum = ip->i_number;
3295         *prevdirremp = NULL;
3296
3297         ACQUIRE_LOCK(&lk);
3298         lbn = lblkno(dp->i_fs, dp->i_offset);
3299         offset = blkoff(dp->i_fs, dp->i_offset);
3300         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3301                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3302         dirrem->dm_pagedep = pagedep;
3303         /*
3304          * Check for a diradd dependency for the same directory entry.
3305          * If present, then both dependencies become obsolete and can
3306          * be de-allocated. Check for an entry on both the pd_dirraddhd
3307          * list and the pd_pendinghd list.
3308          */
3309
3310         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
3311                 if (dap->da_offset == offset)
3312                         break;
3313         if (dap == NULL) {
3314
3315                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
3316                         if (dap->da_offset == offset)
3317                                 break;
3318                 if (dap == NULL)
3319                         return (dirrem);
3320         }
3321         /*
3322          * Must be ATTACHED at this point.
3323          */
3324         if ((dap->da_state & ATTACHED) == 0)
3325                 panic("newdirrem: not ATTACHED");
3326         if (dap->da_newinum != ip->i_number)
3327                 panic("newdirrem: inum %d should be %d",
3328                     ip->i_number, dap->da_newinum);
3329         /*
3330          * If we are deleting a changed name that never made it to disk,
3331          * then return the dirrem describing the previous inode (which
3332          * represents the inode currently referenced from this entry on disk).
3333          */
3334         if ((dap->da_state & DIRCHG) != 0) {
3335                 *prevdirremp = dap->da_previous;
3336                 dap->da_state &= ~DIRCHG;
3337                 dap->da_pagedep = pagedep;
3338         }
3339         /*
3340          * We are deleting an entry that never made it to disk.
3341          * Mark it COMPLETE so we can delete its inode immediately.
3342          */
3343         dirrem->dm_state |= COMPLETE;
3344         free_diradd(dap);
3345         return (dirrem);
3346 }
3347
3348 /*
3349  * Directory entry change dependencies.
3350  *
3351  * Changing an existing directory entry requires that an add operation
3352  * be completed first followed by a deletion. The semantics for the addition
3353  * are identical to the description of adding a new entry above except
3354  * that the rollback is to the old inode number rather than zero. Once
3355  * the addition dependency is completed, the removal is done as described
3356  * in the removal routine above.
3357  */
3358
3359 /*
3360  * This routine should be called immediately after changing
3361  * a directory entry.  The inode's link count should not be
3362  * decremented by the calling procedure -- the soft updates
3363  * code will perform this task when it is safe.
3364  */
3365 void
3366 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3367         struct buf *bp;         /* buffer containing directory block */
3368         struct inode *dp;       /* inode for the directory being modified */
3369         struct inode *ip;       /* inode for directory entry being removed */
3370         ino_t newinum;          /* new inode number for changed entry */
3371         int isrmdir;            /* indicates if doing RMDIR */
3372 {
3373         int offset;
3374         struct diradd *dap = NULL;
3375         struct dirrem *dirrem, *prevdirrem;
3376         struct pagedep *pagedep;
3377         struct inodedep *inodedep;
3378         struct mount *mp;
3379
3380         offset = blkoff(dp->i_fs, dp->i_offset);
3381         mp = UFSTOVFS(dp->i_ump);
3382
3383         /*
3384          * Whiteouts do not need diradd dependencies.
3385          */
3386         if (newinum != WINO) {
3387                 MALLOC(dap, struct diradd *, sizeof(struct diradd),
3388                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
3389                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
3390                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3391                 dap->da_offset = offset;
3392                 dap->da_newinum = newinum;
3393         }
3394
3395         /*
3396          * Allocate a new dirrem and ACQUIRE_LOCK.
3397          */
3398         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3399         pagedep = dirrem->dm_pagedep;
3400         /*
3401          * The possible values for isrmdir:
3402          *      0 - non-directory file rename
3403          *      1 - directory rename within same directory
3404          *   inum - directory rename to new directory of given inode number
3405          * When renaming to a new directory, we are both deleting and
3406          * creating a new directory entry, so the link count on the new
3407          * directory should not change. Thus we do not need the followup
3408          * dirrem which is usually done in handle_workitem_remove. We set
3409          * the DIRCHG flag to tell handle_workitem_remove to skip the
3410          * followup dirrem.
3411          */
3412         if (isrmdir > 1)
3413                 dirrem->dm_state |= DIRCHG;
3414
3415         /*
3416          * Whiteouts have no additional dependencies,
3417          * so just put the dirrem on the correct list.
3418          */
3419         if (newinum == WINO) {
3420                 if ((dirrem->dm_state & COMPLETE) == 0) {
3421                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
3422                             dm_next);
3423                 } else {
3424                         dirrem->dm_dirinum = pagedep->pd_ino;
3425                         add_to_worklist(&dirrem->dm_list);
3426                 }
3427                 FREE_LOCK(&lk);
3428                 return;
3429         }
3430
3431         /*
3432          * If the COMPLETE flag is clear, then there were no active
3433          * entries and we want to roll back to the previous inode until
3434          * the new inode is committed to disk. If the COMPLETE flag is
3435          * set, then we have deleted an entry that never made it to disk.
3436          * If the entry we deleted resulted from a name change, then the old
3437          * inode reference still resides on disk. Any rollback that we do
3438          * needs to be to that old inode (returned to us in prevdirrem). If
3439          * the entry we deleted resulted from a create, then there is
3440          * no entry on the disk, so we want to roll back to zero rather
3441          * than the uncommitted inode. In either of the COMPLETE cases we
3442          * want to immediately free the unwritten and unreferenced inode.
3443          */
3444         if ((dirrem->dm_state & COMPLETE) == 0) {
3445                 dap->da_previous = dirrem;
3446         } else {
3447                 if (prevdirrem != NULL) {
3448                         dap->da_previous = prevdirrem;
3449                 } else {
3450                         dap->da_state &= ~DIRCHG;
3451                         dap->da_pagedep = pagedep;
3452                 }
3453                 dirrem->dm_dirinum = pagedep->pd_ino;
3454                 add_to_worklist(&dirrem->dm_list);
3455         }
3456         /*
3457          * Link into its inodedep. Put it on the id_bufwait list if the inode
3458          * is not yet written. If it is written, do the post-inode write
3459          * processing to put it on the id_pendinghd list.
3460          */
3461         if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
3462             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3463                 dap->da_state |= COMPLETE;
3464                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3465                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3466         } else {
3467                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3468                     dap, da_pdlist);
3469                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3470         }
3471         FREE_LOCK(&lk);
3472 }
3473
3474 /*
3475  * Called whenever the link count on an inode is changed.
3476  * It creates an inode dependency so that the new reference(s)
3477  * to the inode cannot be committed to disk until the updated
3478  * inode has been written.
3479  */
3480 void
3481 softdep_change_linkcnt(ip)
3482         struct inode *ip;       /* the inode with the increased link count */
3483 {
3484         struct inodedep *inodedep;
3485
3486         ACQUIRE_LOCK(&lk);
3487         (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
3488             DEPALLOC, &inodedep);
3489         if (ip->i_nlink < ip->i_effnlink)
3490                 panic("softdep_change_linkcnt: bad delta");
3491         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3492         FREE_LOCK(&lk);
3493 }
3494
3495 /*
3496  * Called when the effective link count and the reference count
3497  * on an inode drops to zero. At this point there are no names
3498  * referencing the file in the filesystem and no active file
3499  * references. The space associated with the file will be freed
3500  * as soon as the necessary soft dependencies are cleared.
3501  */
3502 void
3503 softdep_releasefile(ip)
3504         struct inode *ip;       /* inode with the zero effective link count */
3505 {
3506         struct inodedep *inodedep;
3507         struct fs *fs;
3508         int extblocks;
3509
3510         if (ip->i_effnlink > 0)
3511                 panic("softdep_releasefile: file still referenced");
3512         /*
3513          * We may be called several times as the on-disk link count
3514          * drops to zero. We only want to account for the space once.
3515          */
3516         if (ip->i_flag & IN_SPACECOUNTED)
3517                 return;
3518         /*
3519          * We have to deactivate a snapshot otherwise copyonwrites may
3520          * add blocks and the cleanup may remove blocks after we have
3521          * tried to account for them.
3522          */
3523         if ((ip->i_flags & SF_SNAPSHOT) != 0)
3524                 ffs_snapremove(ITOV(ip));
3525         /*
3526          * If we are tracking an nlinkdelta, we have to also remember
3527          * whether we accounted for the freed space yet.
3528          */
3529         ACQUIRE_LOCK(&lk);
3530         if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
3531                 inodedep->id_state |= SPACECOUNTED;
3532         FREE_LOCK(&lk);
3533         fs = ip->i_fs;
3534         extblocks = 0;
3535         if (fs->fs_magic == FS_UFS2_MAGIC)
3536                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
3537         UFS_LOCK(ip->i_ump);
3538         ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
3539         ip->i_fs->fs_pendinginodes += 1;
3540         UFS_UNLOCK(ip->i_ump);
3541         ip->i_flag |= IN_SPACECOUNTED;
3542 }
3543
3544 /*
3545  * This workitem decrements the inode's link count.
3546  * If the link count reaches zero, the file is removed.
3547  */
3548 static void
3549 handle_workitem_remove(dirrem, xp)
3550         struct dirrem *dirrem;
3551         struct vnode *xp;
3552 {
3553         struct thread *td = curthread;
3554         struct inodedep *inodedep;
3555         struct vnode *vp;
3556         struct inode *ip;
3557         ino_t oldinum;
3558         int error;
3559
3560         if ((vp = xp) == NULL &&
3561             (error = ffs_vget(dirrem->dm_list.wk_mp,
3562             dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) {
3563                 softdep_error("handle_workitem_remove: vget", error);
3564                 return;
3565         }
3566         ip = VTOI(vp);
3567         ACQUIRE_LOCK(&lk);
3568         if ((inodedep_lookup(dirrem->dm_list.wk_mp,
3569             dirrem->dm_oldinum, 0, &inodedep)) == 0)
3570                 panic("handle_workitem_remove: lost inodedep");
3571         /*
3572          * Normal file deletion.
3573          */
3574         if ((dirrem->dm_state & RMDIR) == 0) {
3575                 ip->i_nlink--;
3576                 DIP_SET(ip, i_nlink, ip->i_nlink);
3577                 ip->i_flag |= IN_CHANGE;
3578                 if (ip->i_nlink < ip->i_effnlink)
3579                         panic("handle_workitem_remove: bad file delta");
3580                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3581                 num_dirrem -= 1;
3582                 WORKITEM_FREE(dirrem, D_DIRREM);
3583                 FREE_LOCK(&lk);
3584                 vput(vp);
3585                 return;
3586         }
3587         /*
3588          * Directory deletion. Decrement reference count for both the
3589          * just deleted parent directory entry and the reference for ".".
3590          * Next truncate the directory to length zero. When the
3591          * truncation completes, arrange to have the reference count on
3592          * the parent decremented to account for the loss of "..".
3593          */
3594         ip->i_nlink -= 2;
3595         DIP_SET(ip, i_nlink, ip->i_nlink);
3596         ip->i_flag |= IN_CHANGE;
3597         if (ip->i_nlink < ip->i_effnlink)
3598                 panic("handle_workitem_remove: bad dir delta");
3599         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3600         FREE_LOCK(&lk);
3601         if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
3602                 softdep_error("handle_workitem_remove: truncate", error);
3603         ACQUIRE_LOCK(&lk);
3604         /*
3605          * Rename a directory to a new parent. Since, we are both deleting
3606          * and creating a new directory entry, the link count on the new
3607          * directory should not change. Thus we skip the followup dirrem.
3608          */
3609         if (dirrem->dm_state & DIRCHG) {
3610                 num_dirrem -= 1;
3611                 WORKITEM_FREE(dirrem, D_DIRREM);
3612                 FREE_LOCK(&lk);
3613                 vput(vp);
3614                 return;
3615         }
3616         /*
3617          * If the inodedep does not exist, then the zero'ed inode has
3618          * been written to disk. If the allocated inode has never been
3619          * written to disk, then the on-disk inode is zero'ed. In either
3620          * case we can remove the file immediately.
3621          */
3622         dirrem->dm_state = 0;
3623         oldinum = dirrem->dm_oldinum;
3624         dirrem->dm_oldinum = dirrem->dm_dirinum;
3625         if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
3626             0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
3627                 if (xp != NULL)
3628                         add_to_worklist(&dirrem->dm_list);
3629                 FREE_LOCK(&lk);
3630                 vput(vp);
3631                 if (xp == NULL)
3632                         handle_workitem_remove(dirrem, NULL);
3633                 return;
3634         }
3635         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3636         FREE_LOCK(&lk);
3637         ip->i_flag |= IN_CHANGE;
3638         ffs_update(vp, 0);
3639         vput(vp);
3640 }
3641
3642 /*
3643  * Inode de-allocation dependencies.
3644  *
3645  * When an inode's link count is reduced to zero, it can be de-allocated. We
3646  * found it convenient to postpone de-allocation until after the inode is
3647  * written to disk with its new link count (zero).  At this point, all of the
3648  * on-disk inode's block pointers are nullified and, with careful dependency
3649  * list ordering, all dependencies related to the inode will be satisfied and
3650  * the corresponding dependency structures de-allocated.  So, if/when the
3651  * inode is reused, there will be no mixing of old dependencies with new
3652  * ones.  This artificial dependency is set up by the block de-allocation
3653  * procedure above (softdep_setup_freeblocks) and completed by the
3654  * following procedure.
3655  */
3656 static void
3657 handle_workitem_freefile(freefile)
3658         struct freefile *freefile;
3659 {
3660         struct fs *fs;
3661         struct inodedep *idp;
3662         struct ufsmount *ump;
3663         int error;
3664
3665         ump = VFSTOUFS(freefile->fx_list.wk_mp);
3666         fs = ump->um_fs;
3667 #ifdef DEBUG
3668         ACQUIRE_LOCK(&lk);
3669         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
3670         FREE_LOCK(&lk);
3671         if (error)
3672                 panic("handle_workitem_freefile: inodedep survived");
3673 #endif
3674         UFS_LOCK(ump);
3675         fs->fs_pendinginodes -= 1;
3676         UFS_UNLOCK(ump);
3677         if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
3678             freefile->fx_oldinum, freefile->fx_mode)) != 0)
3679                 softdep_error("handle_workitem_freefile", error);
3680         ACQUIRE_LOCK(&lk);
3681         WORKITEM_FREE(freefile, D_FREEFILE);
3682         FREE_LOCK(&lk);
3683 }
3684
3685
3686 /*
3687  * Helper function which unlinks marker element from work list and returns
3688  * the next element on the list.
3689  */
3690 static __inline struct worklist *
3691 markernext(struct worklist *marker)
3692 {
3693         struct worklist *next;
3694
3695         next = LIST_NEXT(marker, wk_list);
3696         LIST_REMOVE(marker, wk_list);
3697         return next;
3698 }
3699
3700 /*
3701  * Disk writes.
3702  *
3703  * The dependency structures constructed above are most actively used when file
3704  * system blocks are written to disk.  No constraints are placed on when a
3705  * block can be written, but unsatisfied update dependencies are made safe by
3706  * modifying (or replacing) the source memory for the duration of the disk
3707  * write.  When the disk write completes, the memory block is again brought
3708  * up-to-date.
3709  *
3710  * In-core inode structure reclamation.
3711  *
3712  * Because there are a finite number of "in-core" inode structures, they are
3713  * reused regularly.  By transferring all inode-related dependencies to the
3714  * in-memory inode block and indexing them separately (via "inodedep"s), we
3715  * can allow "in-core" inode structures to be reused at any time and avoid
3716  * any increase in contention.
3717  *
3718  * Called just before entering the device driver to initiate a new disk I/O.
3719  * The buffer must be locked, thus, no I/O completion operations can occur
3720  * while we are manipulating its associated dependencies.
3721  */
3722 static void
3723 softdep_disk_io_initiation(bp)
3724         struct buf *bp;         /* structure describing disk write to occur */
3725 {
3726         struct worklist *wk;
3727         struct worklist marker;
3728         struct indirdep *indirdep;
3729         struct inodedep *inodedep;
3730
3731         /*
3732          * We only care about write operations. There should never
3733          * be dependencies for reads.
3734          */
3735         if (bp->b_iocmd != BIO_WRITE)
3736                 panic("softdep_disk_io_initiation: not write");
3737
             /*
              * "marker" is a placeholder entry that is linked into bp->b_dep
              * directly after the item being processed, so that the walk can
              * find its position again once the item has been unlinked or
              * after lk has been dropped and retaken.  It is an automatic
              * variable; presumably that is why the kernel stack is held
              * (PHOLD/PRELE) for the duration of the walk.
              * NOTE(review): markernext() is defined elsewhere; it is assumed
              * to unlink the marker and return the entry that followed it.
              */
3738         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
3739         PHOLD(curproc);                 /* Don't swap out kernel stack */
3740
3741         ACQUIRE_LOCK(&lk);
3742         /*
3743          * Do any necessary pre-I/O processing.
3744          */
3745         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
3746              wk = markernext(&marker)) {
3747                 LIST_INSERT_AFTER(wk, &marker, wk_list);
3748                 switch (wk->wk_type) {
3749
3750                 case D_PAGEDEP:
                             /* Roll back directory entries with pending adds. */
3751                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
3752                         continue;
3753
3754                 case D_INODEDEP:
                             /* The dinode layout is selected by the fs magic. */
3755                         inodedep = WK_INODEDEP(wk);
3756                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3757                                 initiate_write_inodeblock_ufs1(inodedep, bp);
3758                         else
3759                                 initiate_write_inodeblock_ufs2(inodedep, bp);
3760                         continue;
3761
3762                 case D_INDIRDEP:
3763                         indirdep = WK_INDIRDEP(wk);
3764                         if (indirdep->ir_state & GOINGAWAY)
3765                                 panic("disk_io_initiation: indirdep gone");
3766                         /*
3767                          * If there are no remaining dependencies, this
3768                          * will be writing the real pointers, so the
3769                          * dependency can be freed.
3770                          */
3771                         if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
                                     /*
                                      * NOTE: this declaration shadows the
                                      * function parameter.  Inside this
                                      * block "bp" is the saved copy
                                      * (ir_savebp), not the buffer that is
                                      * about to be written.
                                      */
3772                                 struct buf *bp;
3773
3774                                 bp = indirdep->ir_savebp;
3775                                 bp->b_flags |= B_INVAL | B_NOCACHE;
3776                                 /* inline expand WORKLIST_REMOVE(wk); */
3777                                 wk->wk_state &= ~ONWORKLIST;
3778                                 LIST_REMOVE(wk, wk_list);
3779                                 WORKITEM_FREE(indirdep, D_INDIRDEP);
                                     /*
                                      * lk is dropped around brelse(); the
                                      * marker inserted above keeps our place
                                      * in the list meanwhile.
                                      */
3780                                 FREE_LOCK(&lk);
3781                                 brelse(bp);
3782                                 ACQUIRE_LOCK(&lk);
3783                                 continue;
3784                         }
3785                         /*
3786                          * Replace up-to-date version with safe version.
                              * The buffer's current contents are saved in
                              * ir_saveddata and the buffer is overwritten
                              * with the contents of ir_savebp; the indirdep
                              * is marked UNDONE while this is in effect.
                              * lk is released around MALLOC (presumably
                              * because the allocation may sleep; see the
                              * definition of M_SOFTDEP_FLAGS to confirm).
3787                          */
3788                         FREE_LOCK(&lk);
3789                         MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3790                             M_INDIRDEP, M_SOFTDEP_FLAGS);
3791                         ACQUIRE_LOCK(&lk);
3792                         indirdep->ir_state &= ~ATTACHED;
3793                         indirdep->ir_state |= UNDONE;
3794                         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3795                         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3796                             bp->b_bcount);
3797                         continue;
3798
                     /* These types need no work before the write starts. */
3799                 case D_MKDIR:
3800                 case D_BMSAFEMAP:
3801                 case D_ALLOCDIRECT:
3802                 case D_ALLOCINDIR:
3803                         continue;
3804
3805                 default:
3806                         panic("handle_disk_io_initiation: Unexpected type %s",
3807                             TYPENAME(wk->wk_type));
3808                         /* NOTREACHED */
3809                 }
3810         }
3811         FREE_LOCK(&lk);
3812         PRELE(curproc);                 /* Allow swapout of kernel stack */
3813 }
3814
3815 /*
3816  * Called from within the procedure above to deal with unsatisfied
3817  * allocation dependencies in a directory. The buffer must be locked,
3818  * thus, no I/O completion operations can occur while we are
3819  * manipulating its associated dependencies.
      *
      * For every diradd attached to the pagedep, the directory entry found
      * at da_offset in the buffer is rolled back: its inode number is set to
      * the previous inode number (for a DIRCHG entry) or to zero, and the
      * diradd is switched from ATTACHED to UNDONE.  The routine panics if
      * the entry does not currently hold da_newinum.  If the pagedep is
      * already marked IOSTARTED, a message is printed and nothing is changed.
3820  */
3821 static void
3822 initiate_write_filepage(pagedep, bp)
3823         struct pagedep *pagedep;
3824         struct buf *bp;
3825 {
3826         struct diradd *dap;
3827         struct direct *ep;
3828         int i;
3829
3830         if (pagedep->pd_state & IOSTARTED) {
3831                 /*
3832                  * This can only happen if there is a driver that does not
3833                  * understand chaining. Here biodone will reissue the call
3834                  * to strategy for the incomplete buffers.
3835                  */
3836                 printf("initiate_write_filepage: already started\n");
3837                 return;
3838         }
3839         pagedep->pd_state |= IOSTARTED;
             /* Visit every diradd on every hash chain of this pagedep. */
3840         for (i = 0; i < DAHASHSZ; i++) {
3841                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
                             /* Locate the directory entry inside the buffer. */
3842                         ep = (struct direct *)
3843                             ((char *)bp->b_data + dap->da_offset);
3844                         if (ep->d_ino != dap->da_newinum)
3845                                 panic("%s: dir inum %d != new %d",
3846                                     "initiate_write_filepage",
3847                                     ep->d_ino, dap->da_newinum);
                             /*
                              * Roll the entry back: a changed entry reverts
                              * to the old inode number, any other entry is
                              * cleared.
                              */
3848                         if (dap->da_state & DIRCHG)
3849                                 ep->d_ino = dap->da_previous->dm_oldinum;
3850                         else
3851                                 ep->d_ino = 0;
3852                         dap->da_state &= ~ATTACHED;
3853                         dap->da_state |= UNDONE;
3854                 }
3855         }
3856 }
3857
3858 /*
3859  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
3860  * Note that any bug fixes made to this routine must also be made in
3861  * the UFS2 version found below.
3862  *
3863  * Called from within the procedure above to deal with unsatisfied
3864  * allocation dependencies in an inodeblock. The buffer must be
3865  * locked, thus, no I/O completion operations can occur while we
3866  * are manipulating its associated dependencies.
      *
      * inodedep identifies the inode (id_ino) whose dinode inside bp is
      * rolled back.  If the inode's bitmap dependency is not yet complete,
      * the whole dinode is saved in id_savedino1 and zeroed in the buffer
      * (only di_gen is kept).  Otherwise block pointers with pending
      * allocdirects are reset to their old values and di_size is reduced
      * as needed; the original size is remembered in id_savedsize.
3867  */
3868 static void
3869 initiate_write_inodeblock_ufs1(inodedep, bp)
3870         struct inodedep *inodedep;
3871         struct buf *bp;                 /* The inode block */
3872 {
3873         struct allocdirect *adp, *lastadp;
3874         struct ufs1_dinode *dp;
3875         struct ufs1_dinode *sip;
3876         struct fs *fs;
3877         ufs_lbn_t i;
3878 #ifdef INVARIANTS
3879         ufs_lbn_t prevlbn = 0;
3880 #endif
             /*
              * Bitmask with bit ad_lbn set for each allocdirect seen; it is
              * only updated and tested under INVARIANTS.
              */
3881         int deplist;
3882
3883         if (inodedep->id_state & IOSTARTED)
3884                 panic("initiate_write_inodeblock_ufs1: already started");
3885         inodedep->id_state |= IOSTARTED;
3886         fs = inodedep->id_fs;
             /* dp points at this inode's dinode within the buffer. */
3887         dp = (struct ufs1_dinode *)bp->b_data +
3888             ino_to_fsbo(fs, inodedep->id_ino);
3889         /*
3890          * If the bitmap is not yet written, then the allocated
3891          * inode cannot be written to disk.
              * Save a copy of the dinode and write zeros instead, keeping
              * only the generation number.  lk is released around MALLOC.
3892          */
3893         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3894                 if (inodedep->id_savedino1 != NULL)
3895                         panic("initiate_write_inodeblock_ufs1: I/O underway");
3896                 FREE_LOCK(&lk);
3897                 MALLOC(sip, struct ufs1_dinode *,
3898                     sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
3899                 ACQUIRE_LOCK(&lk);
3900                 inodedep->id_savedino1 = sip;
3901                 *inodedep->id_savedino1 = *dp;
3902                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3903                 dp->di_gen = inodedep->id_savedino1->di_gen;
3904                 return;
3905         }
3906         /*
3907          * If no dependencies, then there is nothing to roll back.
              * The current size is recorded first in either case; UFS1
              * dinodes have no ext data, so the saved ext size is zero.
3908          */
3909         inodedep->id_savedsize = dp->di_size;
3910         inodedep->id_savedextsize = 0;
3911         if (TAILQ_EMPTY(&inodedep->id_inoupdt))
3912                 return;
3913         /*
3914          * Set the dependencies to busy.
              * Under INVARIANTS, also verify that the list is sorted by
              * ad_lbn, that each pointer in the dinode matches ad_newblkno,
              * and that each allocdirect is currently ATTACHED.
3915          */
3916         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3917              adp = TAILQ_NEXT(adp, ad_next)) {
3918 #ifdef INVARIANTS
3919                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
3920                         panic("softdep_write_inodeblock: lbn order");
3921                 prevlbn = adp->ad_lbn;
3922                 if (adp->ad_lbn < NDADDR &&
3923                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
3924                         panic("%s: direct pointer #%jd mismatch %d != %jd",
3925                             "softdep_write_inodeblock",
3926                             (intmax_t)adp->ad_lbn,
3927                             dp->di_db[adp->ad_lbn],
3928                             (intmax_t)adp->ad_newblkno);
3929                 if (adp->ad_lbn >= NDADDR &&
3930                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
3931                         panic("%s: indirect pointer #%jd mismatch %d != %jd",
3932                             "softdep_write_inodeblock",
3933                             (intmax_t)adp->ad_lbn - NDADDR,
3934                             dp->di_ib[adp->ad_lbn - NDADDR],
3935                             (intmax_t)adp->ad_newblkno);
3936                 deplist |= 1 << adp->ad_lbn;
3937                 if ((adp->ad_state & ATTACHED) == 0)
3938                         panic("softdep_write_inodeblock: Unknown state 0x%x",
3939                             adp->ad_state);
3940 #endif /* INVARIANTS */
3941                 adp->ad_state &= ~ATTACHED;
3942                 adp->ad_state |= UNDONE;
3943         }
3944         /*
3945          * The on-disk inode cannot claim to be any larger than the last
3946          * fragment that has been written. Otherwise, the on-disk inode
3947          * might have fragments that were not the last block in the file
3948          * which would corrupt the filesystem.
3949          */
3950         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3951              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
                     /* Indirect-block entries are handled after this loop. */
3952                 if (adp->ad_lbn >= NDADDR)
3953                         break;
3954                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3955                 /* keep going until hitting a rollback to a frag */
3956                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3957                         continue;
                     /*
                      * Rolled back to a fragment: the file ends here, so
                      * truncate the size and clear every later direct and
                      * indirect pointer.  Nothing else is left to do.
                      */
3958                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3959                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3960 #ifdef INVARIANTS
3961                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
3962                                 panic("softdep_write_inodeblock: lost dep1");
3963 #endif /* INVARIANTS */
3964                         dp->di_db[i] = 0;
3965                 }
3966                 for (i = 0; i < NIADDR; i++) {
3967 #ifdef INVARIANTS
3968                         if (dp->di_ib[i] != 0 &&
3969                             (deplist & ((1 << NDADDR) << i)) == 0)
3970                                 panic("softdep_write_inodeblock: lost dep2");
3971 #endif /* INVARIANTS */
3972                         dp->di_ib[i] = 0;
3973                 }
3974                 return;
3975         }
3976         /*
3977          * If we have zero'ed out the last allocated block of the file,
3978          * roll back the size to the last currently allocated block.
3979          * We know that this last allocated block is full-sized, as
3980          * we already checked for fragments in the loop above.
              * If no direct pointer at or below lastadp's lbn is set, the
              * scan ends with i == -1 and the size becomes zero.
3981          */
3982         if (lastadp != NULL &&
3983             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3984                 for (i = lastadp->ad_lbn; i >= 0; i--)
3985                         if (dp->di_db[i] != 0)
3986                                 break;
3987                 dp->di_size = (i + 1) * fs->fs_bsize;
3988         }
3989         /*
3990          * The only dependencies are for indirect blocks.
3991          *
3992          * The file size for indirect block additions is not guaranteed.
3993          * Such a guarantee would be non-trivial to achieve. The conventional
3994          * synchronous write implementation also does not make this guarantee.
3995          * Fsck should catch and fix discrepancies. Arguably, the file size
3996          * can be over-estimated without destroying integrity when the file
3997          * moves into the indirect blocks (i.e., is large). If we want to
3998          * postpone fsck, we are stuck with this argument.
3999          */
4000         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
4001                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
4002 }
4003
4004 /*
4005  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
4006  * Note that any bug fixes made to this routine must also be made in
4007  * the UFS1 version found above.
4008  *
4009  * Called from within the procedure above to deal with unsatisfied
4010  * allocation dependencies in an inodeblock. The buffer must be
4011  * locked, thus, no I/O completion operations can occur while we
4012  * are manipulating its associated dependencies.
      *
      * inodedep identifies the inode (id_ino) whose dinode inside bp is
      * rolled back.  If the inode's bitmap dependency is not yet complete,
      * the whole dinode is saved in id_savedino2 and zeroed in the buffer
      * (only di_gen is kept).  Otherwise the ext data pointers (di_extb,
      * list id_extupdt) are rolled back first and the file data pointers
      * (di_db/di_ib, list id_inoupdt) second; the original sizes are
      * remembered in id_savedextsize and id_savedsize.
4013  */
4014 static void
4015 initiate_write_inodeblock_ufs2(inodedep, bp)
4016         struct inodedep *inodedep;
4017         struct buf *bp;                 /* The inode block */
4018 {
4019         struct allocdirect *adp, *lastadp;
4020         struct ufs2_dinode *dp;
4021         struct ufs2_dinode *sip;
4022         struct fs *fs;
4023         ufs_lbn_t i;
4024 #ifdef INVARIANTS
4025         ufs_lbn_t prevlbn = 0;
4026 #endif
             /*
              * Bitmask with bit ad_lbn set for each allocdirect seen; it is
              * only updated and tested under INVARIANTS, and is reset at the
              * start of each of the two "set busy" loops.
              */
4027         int deplist;
4028
4029         if (inodedep->id_state & IOSTARTED)
4030                 panic("initiate_write_inodeblock_ufs2: already started");
4031         inodedep->id_state |= IOSTARTED;
4032         fs = inodedep->id_fs;
             /* dp points at this inode's dinode within the buffer. */
4033         dp = (struct ufs2_dinode *)bp->b_data +
4034             ino_to_fsbo(fs, inodedep->id_ino);
4035         /*
4036          * If the bitmap is not yet written, then the allocated
4037          * inode cannot be written to disk.
              * Save a copy of the dinode and write zeros instead, keeping
              * only the generation number.  lk is released around MALLOC.
4038          */
4039         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4040                 if (inodedep->id_savedino2 != NULL)
4041                         panic("initiate_write_inodeblock_ufs2: I/O underway");
4042                 FREE_LOCK(&lk);
4043                 MALLOC(sip, struct ufs2_dinode *,
4044                     sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
4045                 ACQUIRE_LOCK(&lk);
4046                 inodedep->id_savedino2 = sip;
4047                 *inodedep->id_savedino2 = *dp;
4048                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
4049                 dp->di_gen = inodedep->id_savedino2->di_gen;
4050                 return;
4051         }
4052         /*
4053          * If no dependencies, then there is nothing to roll back.
              * The current file and ext data sizes are recorded first in
              * either case.
4054          */
4055         inodedep->id_savedsize = dp->di_size;
4056         inodedep->id_savedextsize = dp->di_extsize;
4057         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
4058             TAILQ_EMPTY(&inodedep->id_extupdt))
4059                 return;
4060         /*
4061          * Set the ext data dependencies to busy.
              * Under INVARIANTS, also verify that the list is sorted by
              * ad_lbn, that each pointer in the dinode matches ad_newblkno,
              * and that each allocdirect is currently ATTACHED.
4062          */
4063         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
4064              adp = TAILQ_NEXT(adp, ad_next)) {
4065 #ifdef INVARIANTS
4066                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
4067                         panic("softdep_write_inodeblock: lbn order");
4068                 prevlbn = adp->ad_lbn;
4069                 if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
4070                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
4071                             "softdep_write_inodeblock",
4072                             (intmax_t)adp->ad_lbn,
4073                             (intmax_t)dp->di_extb[adp->ad_lbn],
4074                             (intmax_t)adp->ad_newblkno);
4075                 deplist |= 1 << adp->ad_lbn;
4076                 if ((adp->ad_state & ATTACHED) == 0)
4077                         panic("softdep_write_inodeblock: Unknown state 0x%x",
4078                             adp->ad_state);
4079 #endif /* INVARIANTS */
4080                 adp->ad_state &= ~ATTACHED;
4081                 adp->ad_state |= UNDONE;
4082         }
4083         /*
4084          * The on-disk inode cannot claim to be any larger than the last
4085          * fragment that has been written. Otherwise, the on-disk inode
4086          * might have fragments that were not the last block in the ext
4087          * data which would corrupt the filesystem.
4088          */
4089         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
4090              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
4091                 dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
4092                 /* keep going until hitting a rollback to a frag */
4093                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
4094                         continue;
4095                 dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
4096                 for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
4097 #ifdef INVARIANTS
4098                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
4099                                 panic("softdep_write_inodeblock: lost dep1");
4100 #endif /* INVARIANTS */
4101                         dp->di_extb[i] = 0;
4102                 }
                     /*
                      * Unlike the file data loop further down, do not
                      * return here: the file data dependencies still have
                      * to be processed.  Clearing lastadp skips the ext
                      * size adjustment that follows this loop.
                      */
4103                 lastadp = NULL;
4104                 break;
4105         }
4106         /*
4107          * If we have zero'ed out the last allocated block of the ext
4108          * data, roll back the size to the last currently allocated block.
4109          * We know that this last allocated block is full-sized, as
4110          * we already checked for fragments in the loop above.
              * If no ext pointer at or below lastadp's lbn is set, the
              * scan ends with i == -1 and the ext size becomes zero.
4111          */
4112         if (lastadp != NULL &&
4113             dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4114                 for (i = lastadp->ad_lbn; i >= 0; i--)
4115                         if (dp->di_extb[i] != 0)
4116                                 break;
4117                 dp->di_extsize = (i + 1) * fs->fs_bsize;
4118         }
4119         /*
4120          * Set the file data dependencies to busy.
              * The same INVARIANTS checks as for the ext data list apply,
              * here against di_db (lbn < NDADDR) and di_ib (lbn >= NDADDR).
4121          */
4122         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4123              adp = TAILQ_NEXT(adp, ad_next)) {
4124 #ifdef INVARIANTS
4125                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
4126                         panic("softdep_write_inodeblock: lbn order");
4127                 prevlbn = adp->ad_lbn;
4128                 if (adp->ad_lbn < NDADDR &&
4129                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
4130                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
4131                             "softdep_write_inodeblock",
4132                             (intmax_t)adp->ad_lbn,
4133                             (intmax_t)dp->di_db[adp->ad_lbn],
4134                             (intmax_t)adp->ad_newblkno);
4135                 if (adp->ad_lbn >= NDADDR &&
4136                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
4137                         panic("%s indirect pointer #%jd mismatch %jd != %jd",
4138                             "softdep_write_inodeblock:",
4139                             (intmax_t)adp->ad_lbn - NDADDR,
4140                             (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
4141                             (intmax_t)adp->ad_newblkno);
4142                 deplist |= 1 << adp->ad_lbn;
4143                 if ((adp->ad_state & ATTACHED) == 0)
4144                         panic("softdep_write_inodeblock: Unknown state 0x%x",
4145                             adp->ad_state);
4146 #endif /* INVARIANTS */
4147                 adp->ad_state &= ~ATTACHED;
4148                 adp->ad_state |= UNDONE;
4149         }
4150         /*
4151          * The on-disk inode cannot claim to be any larger than the last
4152          * fragment that has been written. Otherwise, the on-disk inode
4153          * might have fragments that were not the last block in the file
4154          * which would corrupt the filesystem.
4155          */
4156         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4157              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
                     /* Indirect-block entries are handled after this loop. */
4158                 if (adp->ad_lbn >= NDADDR)
4159                         break;
4160                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
4161                 /* keep going until hitting a rollback to a frag */
4162                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
4163                         continue;
                     /*
                      * Rolled back to a fragment: the file ends here, so
                      * truncate the size and clear every later direct and
                      * indirect pointer.  Nothing else is left to do.
                      */
4164                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
4165                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
4166 #ifdef INVARIANTS
4167                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
4168                                 panic("softdep_write_inodeblock: lost dep2");
4169 #endif /* INVARIANTS */
4170                         dp->di_db[i] = 0;
4171                 }
4172                 for (i = 0; i < NIADDR; i++) {
4173 #ifdef INVARIANTS
4174                         if (dp->di_ib[i] != 0 &&
4175                             (deplist & ((1 << NDADDR) << i)) == 0)
4176                                 panic("softdep_write_inodeblock: lost dep3");
4177 #endif /* INVARIANTS */
4178                         dp->di_ib[i] = 0;
4179                 }
4180                 return;
4181         }
4182         /*
4183          * If we have zero'ed out the last allocated block of the file,
4184          * roll back the size to the last currently allocated block.
4185          * We know that this last allocated block is full-sized, as
4186          * we already checked for fragments in the loop above.
              * If no direct pointer at or below lastadp's lbn is set, the
              * scan ends with i == -1 and the size becomes zero.
4187          */
4188         if (lastadp != NULL &&
4189             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4190                 for (i = lastadp->ad_lbn; i >= 0; i--)
4191                         if (dp->di_db[i] != 0)
4192                                 break;
4193                 dp->di_size = (i + 1) * fs->fs_bsize;
4194         }
4195         /*
4196          * The only dependencies are for indirect blocks.
4197          *
4198          * The file size for indirect block additions is not guaranteed.
4199          * Such a guarantee would be non-trivial to achieve. The conventional
4200          * synchronous write implementation also does not make this guarantee.
4201          * Fsck should catch and fix discrepancies. Arguably, the file size
4202          * can be over-estimated without destroying integrity when the file
4203          * moves into the indirect blocks (i.e., is large). If we want to
4204          * postpone fsck, we are stuck with this argument.
4205          */
4206         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
4207                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
4208 }
4209
4210 /*
4211  * This routine is called during the completion interrupt
4212  * service routine for a disk write (from the procedure called
4213  * by the device driver to inform the filesystem caches of
4214  * a request completion).  It should be called early in this
4215  * procedure, before the block is made available to other
4216  * processes or other routines are called.
      *
      * Every work item is removed from bp->b_dep and handled according to
      * its type.  Items that still have work outstanding are collected on
      * a local "reattach" list and put back on bp->b_dep at the end.
4217  */
4218 static void
4219 softdep_disk_write_complete(bp)
4220         struct buf *bp;         /* describes the completed disk write */
4221 {
4222         struct worklist *wk;
4223         struct worklist *owk;
4224         struct workhead reattach;
4225         struct newblk *newblk;
4226         struct allocindir *aip;
4227         struct allocdirect *adp;
4228         struct indirdep *indirdep;
4229         struct inodedep *inodedep;
4230         struct bmsafemap *bmsafemap;
4231
4232         /*
4233          * If an error occurred while doing the write, then the data
4234          * has not hit the disk and the dependencies cannot be unrolled.
              * Processing still goes ahead if the buffer is marked B_INVAL.
4235          */
4236         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
4237                 return;
4238         LIST_INIT(&reattach);
4239         /*
4240          * This lock must not be released anywhere in this code segment.
4241          */
4242         ACQUIRE_LOCK(&lk);
             /*
              * owk remembers the previously removed item; seeing the same
              * item again at the head of the list is treated as corruption.
              */
4243         owk = NULL;
4244         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
4245                 WORKLIST_REMOVE(wk);
4246                 if (wk == owk)
4247                         panic("duplicate worklist: %p\n", wk);
4248                 owk = wk;
4249                 switch (wk->wk_type) {
4250
4251                 case D_PAGEDEP:
                             /* A non-zero return means the item must stay. */
4252                         if (handle_written_filepage(WK_PAGEDEP(wk), bp))
4253                                 WORKLIST_INSERT(&reattach, wk);
4254                         continue;
4255
4256                 case D_INODEDEP:
                             /* A non-zero return means the item must stay. */
4257                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
4258                                 WORKLIST_INSERT(&reattach, wk);
4259                         continue;
4260
4261                 case D_BMSAFEMAP:
                             /*
                              * Mark everything that was waiting on this
                              * bmsafemap as DEPCOMPLETE, unhook it from the
                              * bmsafemap's lists, and then free the
                              * bmsafemap itself.
                              */
4262                         bmsafemap = WK_BMSAFEMAP(wk);
4263                         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
4264                                 newblk->nb_state |= DEPCOMPLETE;
4265                                 newblk->nb_bmsafemap = NULL;
4266                                 LIST_REMOVE(newblk, nb_deps);
4267                         }
4268                         while ((adp =
4269                            LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
4270                                 adp->ad_state |= DEPCOMPLETE;
4271                                 adp->ad_buf = NULL;
4272                                 LIST_REMOVE(adp, ad_deps);
4273                                 handle_allocdirect_partdone(adp);
4274                         }
4275                         while ((aip =
4276                             LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
4277                                 aip->ai_state |= DEPCOMPLETE;
4278                                 aip->ai_buf = NULL;
4279                                 LIST_REMOVE(aip, ai_deps);
4280                                 handle_allocindir_partdone(aip);
4281                         }
4282                         while ((inodedep =
4283                              LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
4284                                 inodedep->id_state |= DEPCOMPLETE;
4285                                 LIST_REMOVE(inodedep, id_deps);
4286                                 inodedep->id_buf = NULL;
4287                         }
4288                         WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4289                         continue;
4290
4291                 case D_MKDIR:
4292                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
4293                         continue;
4294
4295                 case D_ALLOCDIRECT:
4296                         adp = WK_ALLOCDIRECT(wk);
4297                         adp->ad_state |= COMPLETE;
4298                         handle_allocdirect_partdone(adp);
4299                         continue;
4300
4301                 case D_ALLOCINDIR:
4302                         aip = WK_ALLOCINDIR(wk);
4303                         aip->ai_state |= COMPLETE;
4304                         handle_allocindir_partdone(aip);
4305                         continue;
4306
4307                 case D_INDIRDEP:
4308                         indirdep = WK_INDIRDEP(wk);
4309                         if (indirdep->ir_state & GOINGAWAY)
4310                                 panic("disk_write_complete: indirdep gone");
                             /*
                              * Copy the contents that were saved in
                              * ir_saveddata back into the buffer, release
                              * the saved copy, and switch the indirdep
                              * from UNDONE back to ATTACHED.
                              */
4311                         bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
4312                         FREE(indirdep->ir_saveddata, M_INDIRDEP);
4313                         indirdep->ir_saveddata = 0;
4314                         indirdep->ir_state &= ~UNDONE;
4315                         indirdep->ir_state |= ATTACHED;
                             /*
                              * Process every allocindir queued on
                              * ir_donehd.  Each call is expected to take
                              * its entry off that list; one that is still
                              * at the head afterwards is fatal.
                              */
4316                         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
4317                                 handle_allocindir_partdone(aip);
4318                                 if (aip == LIST_FIRST(&indirdep->ir_donehd))
4319                                         panic("disk_write_complete: not gone");
4320                         }
                             /*
                              * The indirdep always stays on the buffer.
                              * The statistic is bumped only when the buffer
                              * was not already marked B_DELWRI, and the
                              * buffer is then marked dirty again.
                              */
4321                         WORKLIST_INSERT(&reattach, wk);
4322                         if ((bp->b_flags & B_DELWRI) == 0)
4323                                 stat_indir_blk_ptrs++;
4324                         bdirty(bp);
4325                         continue;
4326
4327                 default:
4328                         panic("handle_disk_write_complete: Unknown type %s",
4329                             TYPENAME(wk->wk_type));
4330                         /* NOTREACHED */
4331                 }
4332         }
4333         /*
4334          * Reattach any requests that must be redone.
4335          */
4336         while ((wk = LIST_FIRST(&reattach)) != NULL) {
4337                 WORKLIST_REMOVE(wk);
4338                 WORKLIST_INSERT(&bp->b_dep, wk);
4339         }
4340         FREE_LOCK(&lk);
4341 }
4342
4343 /*
4344  * Called from within softdep_disk_write_complete above. Note that
4345  * this routine is always called from interrupt level with further
4346  * splbio interrupts blocked.
      *
      * Does nothing unless adp has reached ALLCOMPLETE.  When it has, and
      * no earlier entry on the inode's current update list rolls back to a
      * fragment, adp and the run of ALLCOMPLETE entries that follow it on
      * that list are released with free_allocdirect().
4347  */
4348 static void
4349 handle_allocdirect_partdone(adp)
4350         struct allocdirect *adp;        /* the completed allocdirect */
4351 {
4352         struct allocdirectlst *listhead;
4353         struct allocdirect *listadp;
4354         struct inodedep *inodedep;
4355         long bsize, delay;
4356
4357         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4358                 return;
4359         if (adp->ad_buf != NULL)
4360                 panic("handle_allocdirect_partdone: dangling dep");
4361         /*
4362          * The on-disk inode cannot claim to be any larger than the last
4363          * fragment that has been written. Otherwise, the on-disk inode
4364          * might have fragments that were not the last block in the file
4365          * which would corrupt the filesystem. Thus, we cannot free any
4366          * allocdirects after one whose ad_oldblkno claims a fragment as
4367          * these blocks must be rolled back to zero before writing the inode.
4368          * We check the currently active set of allocdirects in id_inoupdt
4369          * or id_extupdt as appropriate.
4370          */
4371         inodedep = adp->ad_inodedep;
4372         bsize = inodedep->id_fs->fs_bsize;
4373         if (adp->ad_state & EXTDATA)
4374                 listhead = &inodedep->id_extupdt;
4375         else
4376                 listhead = &inodedep->id_inoupdt;
4377         TAILQ_FOREACH(listadp, listhead, ad_next) {
4378                 /* found our block */
4379                 if (listadp == adp)
4380                         break;
4381                 /* continue if ad_oldlbn is not a fragment */
4382                 if (listadp->ad_oldsize == 0 ||
4383                     listadp->ad_oldsize == bsize)
4384                         continue;
4385                 /* hit a fragment */
4386                 return;
4387         }
4388         /*
4389          * If we have reached the end of the current list without
4390          * finding the just finished dependency, then it must be
4391          * on the future dependency list. Future dependencies cannot
4392          * be freed until they are moved to the current list.
              * Under DEBUG, the matching future list is searched and it is
              * fatal if adp is not found there either.
4393          */
4394         if (listadp == NULL) {
4395 #ifdef DEBUG
4396                 if (adp->ad_state & EXTDATA)
4397                         listhead = &inodedep->id_newextupdt;
4398                 else
4399                         listhead = &inodedep->id_newinoupdt;
4400                 TAILQ_FOREACH(listadp, listhead, ad_next)
4401                         /* found our block */
4402                         if (listadp == adp)
4403                                 break;
4404                 if (listadp == NULL)
4405                         panic("handle_allocdirect_partdone: lost dep");
4406 #endif /* DEBUG */
4407                 return;
4408         }
4409         /*
4410          * If we have found the just finished dependency, then free
4411          * it along with anything that follows it that is complete.
4412          * If the inode still has a bitmap dependency, then it has
4413          * never been written to disk, hence the on-disk inode cannot
4414          * reference the old fragment so we can free it without delay.
              * The walk stops at the first entry that is not ALLCOMPLETE.
              * The successor is fetched before each free_allocdirect() call.
4415          */
4416         delay = (inodedep->id_state & DEPCOMPLETE);
4417         for (; adp; adp = listadp) {
4418                 listadp = TAILQ_NEXT(adp, ad_next);
4419                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4420                         return;
4421                 free_allocdirect(listhead, adp, delay);
4422         }
4423 }
4424
4425 /*
4426  * Called from within softdep_disk_write_complete above, always at interrupt
4427  * level with further splbio interrupts blocked. An ALLCOMPLETE allocindir is
4428  * kept on ir_donehd if its indirdep is UNDONE, else put in ir_savebp and freed.
4429  */
4430 static void
4431 handle_allocindir_partdone(aip)
4432         struct allocindir *aip;         /* the completed allocindir */
4433 {
4434         struct indirdep *indirdep;
4435
4436         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4437                 return;         /* not yet ALLCOMPLETE: nothing to do */
4438         if (aip->ai_buf != NULL)
4439                 panic("handle_allocindir_partdone: dangling dependency");
4440         indirdep = aip->ai_indirdep;
4441         if (indirdep->ir_state & UNDONE) {      /* move aip to ir_donehd */
4442                 LIST_REMOVE(aip, ai_next);
4443                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4444                 return;
4445         }
4446         if (indirdep->ir_state & UFS1FMT)       /* slot type follows UFS1FMT */
4447                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4448                     aip->ai_newblkno;
4449         else
4450                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4451                     aip->ai_newblkno;
4452         LIST_REMOVE(aip, ai_next);
4453         if (aip->ai_freefrag != NULL)   /* queue the attached freefrag, if any */
4454                 add_to_worklist(&aip->ai_freefrag->ff_list);
4455         WORKITEM_FREE(aip, D_ALLOCINDIR);
4456 }
4457
4458 /*
4459  * Called from within softdep_disk_write_complete above to restore in-memory
4460  * inode block contents to their most up-to-date state. A nonzero return means
4461  * bp was redirtied here. Note that this routine is always called from
4462  * interrupt level with further splbio interrupts blocked.
4463  */
4464 static int
4465 handle_written_inodeblock(inodedep, bp)
4466         struct inodedep *inodedep;      /* inodedep whose inode lives in bp */
4467         struct buf *bp;         /* buffer containing the inode block */
4468 {
4469         struct worklist *wk, *filefree;
4470         struct allocdirect *adp, *nextadp;
4471         struct ufs1_dinode *dp1 = NULL;
4472         struct ufs2_dinode *dp2 = NULL;
4473         int hadchanges, fstype;
4474
4475         if ((inodedep->id_state & IOSTARTED) == 0)
4476                 panic("handle_written_inodeblock: not started");
4477         inodedep->id_state &= ~IOSTARTED;
4478         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {      /* only one of dp1/dp2 is set */
4479                 fstype = UFS1;
4480                 dp1 = (struct ufs1_dinode *)bp->b_data +
4481                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4482         } else {
4483                 fstype = UFS2;
4484                 dp2 = (struct ufs2_dinode *)bp->b_data +
4485                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4486         }
4487         /*
4488          * If we had to rollback the inode allocation because of
4489          * bitmaps being incomplete, then simply restore it.
4490          * Keep the block dirty so that it will not be reclaimed until
4491          * all associated dependencies have been cleared and the
4492          * corresponding updates written to disk.
4493          */
4494         if (inodedep->id_savedino1 != NULL) {
4495                 if (fstype == UFS1)
4496                         *dp1 = *inodedep->id_savedino1;
4497                 else
4498                         *dp2 = *inodedep->id_savedino2;
4499                 FREE(inodedep->id_savedino1, M_SAVEDINO);      /* NOTE(review): id_savedino2 presumably aliases this - confirm */
4500                 inodedep->id_savedino1 = NULL;
4501                 if ((bp->b_flags & B_DELWRI) == 0)
4502                         stat_inode_bitmap++;
4503                 bdirty(bp);
4504                 return (1);     /* saved inode restored; bp left dirty */
4505         }
4506         inodedep->id_state |= COMPLETE;
4507         /*
4508          * Roll forward anything that had to be rolled back before
4509          * the inode could be updated.
4510          */
4511         hadchanges = 0;
4512         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4513                 nextadp = TAILQ_NEXT(adp, ad_next);
4514                 if (adp->ad_state & ATTACHED)
4515                         panic("handle_written_inodeblock: new entry");
4516                 if (fstype == UFS1) {
4517                         if (adp->ad_lbn < NDADDR) {     /* direct pointer, else indirect */
4518                                 if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4519                                         panic("%s %s #%jd mismatch %d != %jd",
4520                                             "handle_written_inodeblock:",
4521                                             "direct pointer",
4522                                             (intmax_t)adp->ad_lbn,
4523                                             dp1->di_db[adp->ad_lbn],
4524                                             (intmax_t)adp->ad_oldblkno);
4525                                 dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4526                         } else {
4527                                 if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4528                                         panic("%s: %s #%jd allocated as %d",
4529                                             "handle_written_inodeblock",
4530                                             "indirect pointer",
4531                                             (intmax_t)adp->ad_lbn - NDADDR,
4532                                             dp1->di_ib[adp->ad_lbn - NDADDR]);
4533                                 dp1->di_ib[adp->ad_lbn - NDADDR] =
4534                                     adp->ad_newblkno;
4535                         }
4536                 } else {
4537                         if (adp->ad_lbn < NDADDR) {     /* direct pointer, else indirect */
4538                                 if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4539                                         panic("%s: %s #%jd %s %jd != %jd",
4540                                             "handle_written_inodeblock",
4541                                             "direct pointer",
4542                                             (intmax_t)adp->ad_lbn, "mismatch",
4543                                             (intmax_t)dp2->di_db[adp->ad_lbn],
4544                                             (intmax_t)adp->ad_oldblkno);
4545                                 dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4546                         } else {
4547                                 if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4548                                         panic("%s: %s #%jd allocated as %jd",
4549                                             "handle_written_inodeblock",
4550                                             "indirect pointer",
4551                                             (intmax_t)adp->ad_lbn - NDADDR,
4552                                             (intmax_t)
4553                                             dp2->di_ib[adp->ad_lbn - NDADDR]);
4554                                 dp2->di_ib[adp->ad_lbn - NDADDR] =
4555                                     adp->ad_newblkno;
4556                         }
4557                 }
4558                 adp->ad_state &= ~UNDONE;
4559                 adp->ad_state |= ATTACHED;
4560                 hadchanges = 1;
4561         }
4562         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
4563                 nextadp = TAILQ_NEXT(adp, ad_next);
4564                 if (adp->ad_state & ATTACHED)
4565                         panic("handle_written_inodeblock: new entry");
4566                 if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)     /* NOTE(review): dp2 is NULL on UFS1; assumes id_extupdt is empty there - confirm */
4567                         panic("%s: direct pointers #%jd %s %jd != %jd",
4568                             "handle_written_inodeblock",
4569                             (intmax_t)adp->ad_lbn, "mismatch",
4570                             (intmax_t)dp2->di_extb[adp->ad_lbn],
4571                             (intmax_t)adp->ad_oldblkno);
4572                 dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
4573                 adp->ad_state &= ~UNDONE;
4574                 adp->ad_state |= ATTACHED;
4575                 hadchanges = 1;
4576         }
4577         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4578                 stat_direct_blk_ptrs++;
4579         /*
4580          * Reset the file size to its most up-to-date value.
4581          */
4582         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
4583                 panic("handle_written_inodeblock: bad size");
4584         if (fstype == UFS1) {
4585                 if (dp1->di_size != inodedep->id_savedsize) {
4586                         dp1->di_size = inodedep->id_savedsize;
4587                         hadchanges = 1;
4588                 }
4589         } else {
4590                 if (dp2->di_size != inodedep->id_savedsize) {
4591                         dp2->di_size = inodedep->id_savedsize;
4592                         hadchanges = 1;
4593                 }
4594                 if (dp2->di_extsize != inodedep->id_savedextsize) {
4595                         dp2->di_extsize = inodedep->id_savedextsize;
4596                         hadchanges = 1;
4597                 }
4598         }
4599         inodedep->id_savedsize = -1;    /* consumed; -1 trips the "bad size" panic above */
4600         inodedep->id_savedextsize = -1;
4601         /*
4602          * If there were any rollbacks in the inode block, then it must be
4603          * marked dirty so that it will eventually get written back in
4604          * its correct form.
4605          */
4606         if (hadchanges)
4607                 bdirty(bp);
4608         /*
4609          * Process any allocdirects that completed during the update.
4610          */
4611         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4612                 handle_allocdirect_partdone(adp);
4613         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
4614                 handle_allocdirect_partdone(adp);
4615         /*
4616          * Process deallocations that were held pending until the
4617          * inode had been written to disk. Freeing of the inode
4618          * is delayed until after all blocks have been freed to
4619          * avoid creation of new <vfsid, inum, lbn> triples
4620          * before the old ones have been deleted.
4621          */
4622         filefree = NULL;
4623         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4624                 WORKLIST_REMOVE(wk);
4625                 switch (wk->wk_type) {
4626
4627                 case D_FREEFILE:
4628                         /*
4629                          * We defer adding filefree to the worklist until
4630                          * all other additions have been made to ensure
4631                          * that it will be done after all the old blocks
4632                          * have been freed.
4633                          */
4634                         if (filefree != NULL)
4635                                 panic("handle_written_inodeblock: filefree");
4636                         filefree = wk;
4637                         continue;
4638
4639                 case D_MKDIR:
4640                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4641                         continue;
4642
4643                 case D_DIRADD:
4644                         diradd_inode_written(WK_DIRADD(wk), inodedep);
4645                         continue;
4646
4647                 case D_FREEBLKS:
4648                         wk->wk_state |= COMPLETE;
4649                         if ((wk->wk_state  & ALLCOMPLETE) != ALLCOMPLETE)
4650                                 continue;       /* not ALLCOMPLETE: do not queue it yet */
4651                          /* -- fall through -- */
4652                 case D_FREEFRAG:
4653                 case D_DIRREM:
4654                         add_to_worklist(wk);
4655                         continue;
4656
4657                 case D_NEWDIRBLK:
4658                         free_newdirblk(WK_NEWDIRBLK(wk));
4659                         continue;
4660
4661                 default:
4662                         panic("handle_written_inodeblock: Unknown type %s",
4663                             TYPENAME(wk->wk_type));
4664                         /* NOTREACHED */
4665                 }
4666         }
4667         if (filefree != NULL) {
4668                 if (free_inodedep(inodedep) == 0)       /* presumably 0 == not freed */
4669                         panic("handle_written_inodeblock: live inodedep");
4670                 add_to_worklist(filefree);      /* queued last, after the block frees above */
4671                 return (0);
4672         }
4673
4674         /*
4675          * If no outstanding dependencies, free it.
4676          */
4677         if (free_inodedep(inodedep) ||  /* NOTE(review): presumably freed if nonzero; || then skips the reads below */
4678             (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
4679              TAILQ_FIRST(&inodedep->id_extupdt) == 0))
4680                 return (0);
4681         return (hadchanges);    /* nonzero only if bp was redirtied above */
4682 }
4683
4684 /*
4685  * Process a diradd after its inode has been written: mark it COMPLETE and put
4686  * it on id_pendinghd. Must be called with splbio interrupts blocked.
4687  */
4688 static void
4689 diradd_inode_written(dap, inodedep)
4690         struct diradd *dap;             /* diradd whose inode was written */
4691         struct inodedep *inodedep;      /* its id_pendinghd receives dap */
4692 {
4693         struct pagedep *pagedep;
4694
4695         dap->da_state |= COMPLETE;
4696         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {    /* move to pd_pendinghd */
4697                 if (dap->da_state & DIRCHG)     /* DIRCHG: pagedep is reached via da_previous */
4698                         pagedep = dap->da_previous->dm_pagedep;
4699                 else
4700                         pagedep = dap->da_pagedep;
4701                 LIST_REMOVE(dap, da_pdlist);
4702                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4703         }
4704         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4705 }
4706
4707 /*
4708  * Handle completion of a mkdir dependency: clear `type' in its diradd, free it.
4709  */
4710 static void
4711 handle_written_mkdir(mkdir, type)
4712         struct mkdir *mkdir;    /* the completed mkdir dependency */
4713         int type;               /* must equal md_state, e.g. MKDIR_PARENT */
4714 {
4715         struct diradd *dap;
4716         struct pagedep *pagedep;
4717
4718         if (mkdir->md_state != type)
4719                 panic("handle_written_mkdir: bad type");
4720         dap = mkdir->md_diradd;
4721         dap->da_state &= ~type;
4722         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) /* neither mkdir bit is left */
4723                 dap->da_state |= DEPCOMPLETE;
4724         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {    /* move to pd_pendinghd */
4725                 if (dap->da_state & DIRCHG)     /* DIRCHG: pagedep is reached via da_previous */
4726                         pagedep = dap->da_previous->dm_pagedep;
4727                 else
4728                         pagedep = dap->da_pagedep;
4729                 LIST_REMOVE(dap, da_pdlist);
4730                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4731         }
4732         LIST_REMOVE(mkdir, md_mkdirs);
4733         WORKITEM_FREE(mkdir, D_MKDIR);
4734 }
4735
4736 /*
4737  * Called from within softdep_disk_write_complete above. A write operation was
4738  * just completed. Removed inodes can now be freed and associated block
4739  * pointers may be committed. Returns 1 if directory entries were restored and
4740  * bp redirtied, else 0. Note that this routine is always called from
4741  * interrupt level with further splbio interrupts blocked.
4742  */
4743 static int
4744 handle_written_filepage(pagedep, bp)
4745         struct pagedep *pagedep;        /* dependencies for the page in bp */
4746         struct buf *bp;         /* buffer containing the written page */
4747 {
4748         struct dirrem *dirrem;
4749         struct diradd *dap, *nextdap;
4750         struct direct *ep;
4751         int i, chgs;
4752
4753         if ((pagedep->pd_state & IOSTARTED) == 0)
4754                 panic("handle_written_filepage: not started");
4755         pagedep->pd_state &= ~IOSTARTED;
4756         /*
4757          * Process any directory removals that have been committed.
4758          */
4759         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4760                 LIST_REMOVE(dirrem, dm_next);
4761                 dirrem->dm_dirinum = pagedep->pd_ino;
4762                 add_to_worklist(&dirrem->dm_list);
4763         }
4764         /*
4765          * Free any directory additions that have been committed.
4766          * If it is a newly allocated block, we have to wait until
4767          * the on-disk directory inode claims the new block.
4768          */
4769         if ((pagedep->pd_state & NEWBLOCK) == 0)
4770                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4771                         free_diradd(dap);
4772         /*
4773          * Uncommitted directory entries must be restored.
4774          */
4775         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4776                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4777                      dap = nextdap) {
4778                         nextdap = LIST_NEXT(dap, da_pdlist);    /* fetch first: dap may move to pd_pendinghd below */
4779                         if (dap->da_state & ATTACHED)
4780                                 panic("handle_written_filepage: attached");
4781                         ep = (struct direct *)
4782                             ((char *)bp->b_data + dap->da_offset);
4783                         ep->d_ino = dap->da_newinum;    /* put the new inode number back in the entry */
4784                         dap->da_state &= ~UNDONE;
4785                         dap->da_state |= ATTACHED;
4786                         chgs = 1;
4787                         /*
4788                          * If the inode referenced by the directory has
4789                          * been written out, then the dependency can be
4790                          * moved to the pending list.
4791                          */
4792                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4793                                 LIST_REMOVE(dap, da_pdlist);
4794                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4795                                     da_pdlist);
4796                         }
4797                 }
4798         }
4799         /*
4800          * If there were any rollbacks in the directory, then it must be
4801          * marked dirty so that it will eventually get written back in
4802          * its correct form.
4803          */
4804         if (chgs) {
4805                 if ((bp->b_flags & B_DELWRI) == 0)
4806                         stat_dir_entry++;
4807                 bdirty(bp);
4808                 return (1);     /* pagedep is kept; bp was redirtied */
4809         }
4810         /*
4811          * If we are not waiting for a new directory block to be
4812          * claimed by its inode, then the pagedep will be freed.
4813          * Otherwise it will remain to track any new entries on
4814          * the page in case they are fsync'ed.
4815          */
4816         if ((pagedep->pd_state & NEWBLOCK) == 0) {
4817                 LIST_REMOVE(pagedep, pd_hash);
4818                 WORKITEM_FREE(pagedep, D_PAGEDEP);
4819         }
4820         return (0);
4821 }
4822
4823 /*
4824  * Writing back in-core inode structures.
4825  *
4826  * The filesystem only accesses an inode's contents when it occupies an
4827  * "in-core" inode structure.  These "in-core" structures are separate from
4828  * the page frames used to cache inode blocks.  Only the latter are
4829  * transferred to/from the disk.  So, when the updated contents of the
4830  * "in-core" inode structure are copied to the corresponding in-memory inode
4831  * block, the dependencies are also transferred.  The following procedure is
4832  * called when copying a dirty "in-core" inode to a cached inode block.
4833  */
4834
4835 /*
4836  * Called when an inode is loaded from disk. If the effective link count
4837  * differed from the actual link count when it was last flushed, then we
4838  * need to ensure that the correct effective link count is put back.
4839  */
4840 void
4841 softdep_load_inodeblock(ip)
4842         struct inode *ip;       /* the "in_core" copy of the inode */
4843 {
4844         struct inodedep *inodedep;
4845
4846         /*
4847          * Check for alternate nlink count; start from the actual count.
4848          */
4849         ip->i_effnlink = ip->i_nlink;
4850         ACQUIRE_LOCK(&lk);
4851         if (inodedep_lookup(UFSTOVFS(ip->i_ump),
4852             ip->i_number, 0, &inodedep) == 0) { /* presumably 0 == no inodedep exists */
4853                 FREE_LOCK(&lk);
4854                 return;         /* i_effnlink stays equal to i_nlink */
4855         }
4856         ip->i_effnlink -= inodedep->id_nlinkdelta;
4857         if (inodedep->id_state & SPACECOUNTED)  /* mirror the flag onto the in-core inode */
4858                 ip->i_flag |= IN_SPACECOUNTED;
4859         FREE_LOCK(&lk);
4860 }
4861
4862 /*
4863  * This routine is called just before the "in-core" inode
4864  * information is to be copied to the in-memory inode block.
4865  * Recall that an inode block contains several inodes. If
4866  * waitfor is nonzero (the force flag), then the dependencies will be
4867  * cleared so that the update can always be made. Note that
4868  * the buffer is locked when this routine is called, so we
4869  * will never be in the middle of writing the inode block
4870  * to disk.
4871  */
4872 void
4873 softdep_update_inodeblock(ip, bp, waitfor)
4874         struct inode *ip;       /* the "in_core" copy of the inode */
4875         struct buf *bp;         /* the buffer containing the inode block */
4876         int waitfor;            /* nonzero => update must be allowed */
4877 {
4878         struct inodedep *inodedep;
4879         struct worklist *wk;
4880         struct mount *mp;
4881         struct buf *ibp;
4882         int error;
4883
4884         /*
4885          * If the effective link count is not equal to the actual link
4886          * count, then we must track the difference in an inodedep while
4887          * the inode is (potentially) tossed out of the cache. Otherwise,
4888          * if there is no existing inodedep, then there are no dependencies
4889          * to track.
4890          */
4891         mp = UFSTOVFS(ip->i_ump);
4892         ACQUIRE_LOCK(&lk);
4893         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
4894                 FREE_LOCK(&lk);
4895                 if (ip->i_effnlink != ip->i_nlink)
4896                         panic("softdep_update_inodeblock: bad link count");
4897                 return;
4898         }
4899         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
4900                 panic("softdep_update_inodeblock: bad delta");
4901         /*
4902          * Changes have been initiated. Anything depending on these
4903          * changes cannot occur until this inode has been written.
4904          */
4905         inodedep->id_state &= ~COMPLETE;
4906         if ((inodedep->id_state & ONWORKLIST) == 0)     /* insert only if not already on a worklist */
4907                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4908         /*
4909          * Any new dependencies associated with the incore inode must
4910          * now be moved to the list associated with the buffer holding
4911          * the in-memory copy of the inode. Once merged process any
4912          * allocdirects that are completed by the merger.
4913          */
4914         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
4915         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
4916                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4917         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
4918         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
4919                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
4920         /*
4921          * Now that the inode has been pushed into the buffer, the
4922          * operations dependent on the inode being written to disk
4923          * can be moved to the id_bufwait so that they will be
4924          * processed when the buffer I/O completes.
4925          */
4926         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4927                 WORKLIST_REMOVE(wk);
4928                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4929         }
4930         /*
4931          * Newly allocated inodes cannot be written until the bitmap
4932          * that allocates them has been written (indicated by
4933          * DEPCOMPLETE being set in id_state). If we are doing a
4934          * forced sync (e.g., an fsync on a file), we force the bitmap
4935          * to be written so that the update can be done.
4936          */
4937         if (waitfor == 0) {
4938                 FREE_LOCK(&lk);
4939                 return;
4940         }
4941 retry:  /* entered with lk held */
4942         if ((inodedep->id_state & DEPCOMPLETE) != 0) {
4943                 FREE_LOCK(&lk);
4944                 return;
4945         }
4946         ibp = inodedep->id_buf;
4947         ibp = getdirtybuf(ibp, &lk, MNT_WAIT); /* may sleep; see the NULL case below */
4948         if (ibp == NULL) {
4949                 /*
4950                  * If ibp came back as NULL, the dependency could have been
4951                  * freed while we slept.  Look it up again, and check to see
4952                  * that it has completed.
4953                  */
4954                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
4955                         goto retry;     /* still exists: re-test DEPCOMPLETE */
4956                 FREE_LOCK(&lk);
4957                 return;
4958         }
4959         FREE_LOCK(&lk);         /* lk is released before the bwrite */
4960         if ((error = bwrite(ibp)) != 0)
4961                 softdep_error("softdep_update_inodeblock: bwrite", error);
4962 }
4963
4964 /*
4965  * Merge a new inode dependency list (such as id_newinoupdt) into an old inode
4966  * dependency list (such as id_inoupdt), placing entries by ad_lbn. This
4967  * routine must be called with splbio interrupts blocked.
4968  */
4969 static void
4970 merge_inode_lists(newlisthead, oldlisthead)
4971         struct allocdirectlst *newlisthead;     /* emptied by the merge */
4972         struct allocdirectlst *oldlisthead;     /* receives every entry */
4973 {
4974         struct allocdirect *listadp, *newadp;
4975
4976         newadp = TAILQ_FIRST(newlisthead);
4977         for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
4978                 if (listadp->ad_lbn < newadp->ad_lbn) { /* old entry sorts first: skip it */
4979                         listadp = TAILQ_NEXT(listadp, ad_next);
4980                         continue;
4981                 }
4982                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
4983                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4984                 if (listadp->ad_lbn == newadp->ad_lbn) {        /* same lbn: combine the two */
4985                         allocdirect_merge(oldlisthead, newadp,
4986                             listadp);
4987                         listadp = newadp;       /* NOTE(review): presumably the old listadp is gone after the merge - confirm */
4988                 }
4989                 newadp = TAILQ_FIRST(newlisthead);
4990         }
4991         while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {  /* leftovers go on the tail */
4992                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
4993                 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
4994         }
4995 }
4996
4997 /*
4998  * If we are doing an fsync, then we must ensure that any directory
4999  * entries for the inode have been written after the inode gets to disk.
5000  */
5001 int
5002 softdep_fsync(vp)
5003         struct vnode *vp;       /* the "in_core" copy of the inode */
5004 {
5005         struct inodedep *inodedep;
5006         struct pagedep *pagedep;
5007         struct worklist *wk;
5008         struct diradd *dap;
5009         struct mount *mp;
5010         struct vnode *pvp;
5011         struct inode *ip;
5012         struct buf *bp;
5013         struct fs *fs;
5014         struct thread *td = curthread;
5015         int error, flushparent, pagedep_new_block;
5016         ino_t parentino;
5017         ufs_lbn_t lbn;
5018
5019         ip = VTOI(vp);
5020         fs = ip->i_fs;
5021         mp = vp->v_mount;
5022         ACQUIRE_LOCK(&lk);
5023         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
5024                 FREE_LOCK(&lk);
5025                 return (0);
5026         }
5027         if (!LIST_EMPTY(&inodedep->id_inowait) ||
5028             !LIST_EMPTY(&inodedep->id_bufwait) ||
5029             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5030             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5031             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5032             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
5033                 panic("softdep_fsync: pending ops");
5034         for (error = 0, flushparent = 0; ; ) {
5035                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
5036                         break;
5037                 if (wk->wk_type != D_DIRADD)
5038                         panic("softdep_fsync: Unexpected type %s",
5039                             TYPENAME(wk->wk_type));
5040                 dap = WK_DIRADD(wk);
5041                 /*
5042                  * Flush our parent if this directory entry has a MKDIR_PARENT
5043                  * dependency or is contained in a newly allocated block.
5044                  */
5045                 if (dap->da_state & DIRCHG)
5046                         pagedep = dap->da_previous->dm_pagedep;
5047                 else
5048                         pagedep = dap->da_pagedep;
5049                 parentino = pagedep->pd_ino;
5050                 lbn = pagedep->pd_lbn;
5051                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
5052                         panic("softdep_fsync: dirty");
5053                 if ((dap->da_state & MKDIR_PARENT) ||
5054                     (pagedep->pd_state & NEWBLOCK))
5055                         flushparent = 1;
5056                 else
5057                         flushparent = 0;
5058                 /*
5059                  * If we are being fsync'ed as part of vgone'ing this vnode,
5060                  * then we will not be able to release and recover the
5061                  * vnode below, so we just have to give up on writing its
5062                  * directory entry out. It will eventually be written, just
5063                  * not now, but then the user was not asking to have it
5064                  * written, so we are not breaking any promises.
5065                  */
5066                 if (vp->v_iflag & VI_DOOMED)
5067                         break;
5068                 /*
5069                  * We prevent deadlock by always fetching inodes from the
5070                  * root, moving down the directory tree. Thus, when fetching
5071                  * our parent directory, we first try to get the lock. If
5072                  * that fails, we must unlock ourselves before requesting
5073                  * the lock on our parent. See the comment in ufs_lookup
5074                  * for details on possible races.
5075                  */
5076                 FREE_LOCK(&lk);
5077                 if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
5078                         VOP_UNLOCK(vp, 0, td);
5079                         error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp);
5080                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
5081                         if (error != 0)
5082                                 return (error);
5083                 }
5084                 /*
5085                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
5086                  * that are contained in direct blocks will be resolved by
5087                  * doing a ffs_update. Pagedeps contained in indirect blocks
5088                  * may require a complete sync'ing of the directory. So, we
5089                  * try the cheap and fast ffs_update first, and if that fails,
5090                  * then we do the slower ffs_syncvnode of the directory.
5091                  */
5092                 if (flushparent) {
5093                         int locked;
5094
5095                         if ((error = ffs_update(pvp, 1)) != 0) {
5096                                 vput(pvp);
5097                                 return (error);
5098                         }
5099                         ACQUIRE_LOCK(&lk);
5100                         locked = 1;
5101                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
5102                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
5103                                         if (wk->wk_type != D_DIRADD)
5104                                                 panic("softdep_fsync: Unexpected type %s",
5105                                                       TYPENAME(wk->wk_type));
5106                                         dap = WK_DIRADD(wk);
5107                                         if (dap->da_state & DIRCHG)
5108                                                 pagedep = dap->da_previous->dm_pagedep;
5109                                         else
5110                                                 pagedep = dap->da_pagedep;
5111                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
5112                                         FREE_LOCK(&lk);
5113                                         locked = 0;
5114                                         if (pagedep_new_block &&
5115                                             (error = ffs_syncvnode(pvp, MNT_WAIT))) {
5116                                                 vput(pvp);
5117                                                 return (error);
5118                                         }
5119                                 }
5120                         }
5121                         if (locked)
5122                                 FREE_LOCK(&lk);
5123                 }
5124                 /*
5125                  * Flush directory page containing the inode's name.
5126                  */
5127                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
5128                     &bp);
5129                 if (error == 0)
5130                         error = bwrite(bp);
5131                 else
5132                         brelse(bp);
5133                 vput(pvp);
5134                 if (error != 0)
5135                         return (error);
5136                 ACQUIRE_LOCK(&lk);
5137                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
5138                         break;
5139         }
5140         FREE_LOCK(&lk);
5141         return (0);
5142 }
5143
5144 /*
5145  * Flush all the dirty bitmaps associated with the block device
5146  * before flushing the rest of the dirty blocks so as to reduce
5147  * the number of dependencies that will have to be rolled back.
5148  */
5149 void
5150 softdep_fsync_mountdev(vp)
5151         struct vnode *vp;
5152 {
5153         struct buf *bp, *nbp;
5154         struct worklist *wk;
5155
5156         if (!vn_isdisk(vp, NULL))
5157                 panic("softdep_fsync_mountdev: vnode not a disk");
5158 restart:
5159         ACQUIRE_LOCK(&lk);
5160         VI_LOCK(vp);
5161         TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
5162                 /*
5163                  * If it is already scheduled, skip to the next buffer.
5164                  */
5165                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
5166                         continue;
5167
5168                 if ((bp->b_flags & B_DELWRI) == 0)
5169                         panic("softdep_fsync_mountdev: not dirty");
5170                 /*
5171                  * We are only interested in bitmaps with outstanding
5172                  * dependencies.
5173                  */
5174                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
5175                     wk->wk_type != D_BMSAFEMAP ||
5176                     (bp->b_vflags & BV_BKGRDINPROG)) {
5177                         BUF_UNLOCK(bp);
5178                         continue;
5179                 }
5180                 VI_UNLOCK(vp);
5181                 FREE_LOCK(&lk);
5182                 bremfree(bp);
5183                 (void) bawrite(bp);
5184                 goto restart;
5185         }
5186         FREE_LOCK(&lk);
5187         drain_output(vp);
5188         VI_UNLOCK(vp);
5189 }
5190
5191 /*
5192  * This routine is called when we are trying to synchronously flush a
5193  * file. This routine must eliminate any filesystem metadata dependencies
5194  * so that the syncing routine can succeed by pushing the dirty blocks
5195  * associated with the file. If any I/O errors occur, they are returned.
      *
      * Returns 0 immediately when DOINGSOFTDEP(vp) is false. Otherwise
      * returns 0 once both passes over the dirty buffer list are done, or
      * the first non-zero error produced by flush_inodedep_deps(),
      * flush_pagedep_deps() or bwrite().
      *
      * The softdep lock (lk) and the vnode interlock are both taken and
      * released inside this routine; neither is held on return.
5196  */
5197 int
5198 softdep_sync_metadata(struct vnode *vp)
5199 {
5200         struct pagedep *pagedep;
5201         struct allocdirect *adp;
5202         struct allocindir *aip;
5203         struct buf *bp, *nbp;
5204         struct worklist *wk;
5205         int i, error, waitfor;
5206
5207         if (!DOINGSOFTDEP(vp))
5208                 return (0);
5209         /*
5210          * Ensure that any direct block dependencies have been cleared.
5211          */
5212         ACQUIRE_LOCK(&lk);
5213         if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
5214                 FREE_LOCK(&lk);
5215                 return (error);
5216         }
5217         FREE_LOCK(&lk);
5218         /*
5219          * For most files, the only metadata dependencies are the
5220          * cylinder group maps that allocate their inode or blocks.
5221          * The block allocation dependencies can be found by traversing
5222          * the dependency lists for any buffers that remain on their
5223          * dirty buffer list. The inode allocation dependency will
5224          * be resolved when the inode is updated with MNT_WAIT.
5225          * This work is done in two passes. The first pass grabs most
5226          * of the buffers and begins asynchronously writing them. The
5227          * only way to wait for these asynchronous writes is to sleep
5228          * on the filesystem vnode which may stay busy for a long time
5229          * if the filesystem is active. So, instead, we make a second
5230          * pass over the dependencies blocking on each write. In the
5231          * usual case we will be blocking against a write that we
5232          * initiated, so when it is done the dependency will have been
5233          * resolved. Thus the second pass is expected to end quickly.
5234          */
5235         waitfor = MNT_NOWAIT;
5236
5237 top:
5238         /*
5239          * We must wait for any I/O in progress to finish so that
5240          * all potential buffers on the dirty list will be visible.
5241          */
5242         VI_LOCK(vp);
5243         drain_output(vp);
             /*
              * Keep asking for the buffer at the head of the dirty list
              * until getdirtybuf() hands one back or the list is empty.
              */
5244         while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
5245                 bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
5246                 if (bp)
5247                         break;
5248         }
5249         VI_UNLOCK(vp);
             /* Nothing dirty is left on the vnode. */
5250         if (bp == NULL)
5251                 return (0);
5252 loop:
5253         /* While syncing snapshots, we must allow recursive lookups */
5254         bp->b_lock.lk_flags |= LK_CANRECURSE;
5255         ACQUIRE_LOCK(&lk);
5256         /*
5257          * As we hold the buffer locked, none of its dependencies
5258          * will disappear.
              *
              * Inside the switch, "continue" moves on to the next
              * dependency with lk held, while "break" leaves the switch
              * and falls into loop_end, which expects lk to be released
              * and error to be non-zero.
5259          */
5260         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5261                 switch (wk->wk_type) {
5262
5263                 case D_ALLOCDIRECT:
                             /*
                              * Push the buffer recorded in ad_buf unless the
                              * dependency is already DEPCOMPLETE; the write is
                              * asynchronous on the first pass and synchronous
                              * on the second.
                              */
5264                         adp = WK_ALLOCDIRECT(wk);
5265                         if (adp->ad_state & DEPCOMPLETE)
5266                                 continue;
5267                         nbp = adp->ad_buf;
5268                         nbp = getdirtybuf(nbp, &lk, waitfor);
5269                         if (nbp == NULL)
5270                                 continue;
5271                         FREE_LOCK(&lk);
5272                         if (waitfor == MNT_NOWAIT) {
5273                                 bawrite(nbp);
5274                         } else if ((error = bwrite(nbp)) != 0) {
5275                                 break;
5276                         }
5277                         ACQUIRE_LOCK(&lk);
5278                         continue;
5279
5280                 case D_ALLOCINDIR:
                             /* Same handling as D_ALLOCDIRECT, using ai_buf. */
5281                         aip = WK_ALLOCINDIR(wk);
5282                         if (aip->ai_state & DEPCOMPLETE)
5283                                 continue;
5284                         nbp = aip->ai_buf;
5285                         nbp = getdirtybuf(nbp, &lk, waitfor);
5286                         if (nbp == NULL)
5287                                 continue;
5288                         FREE_LOCK(&lk);
5289                         if (waitfor == MNT_NOWAIT) {
5290                                 bawrite(nbp);
5291                         } else if ((error = bwrite(nbp)) != 0) {
5292                                 break;
5293                         }
5294                         ACQUIRE_LOCK(&lk);
5295                         continue;
5296
5297                 case D_INDIRDEP:
5298                 restart:
5299
                             /*
                              * Unlike the other cases this one always uses
                              * MNT_WAIT and a synchronous bwrite(), whichever
                              * pass we are in. The list is rescanned from its
                              * head after every attempt, and the case ends
                              * only when a full scan finds every allocindir
                              * DEPCOMPLETE.
                              */
5300                         LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
5301                                 if (aip->ai_state & DEPCOMPLETE)
5302                                         continue;
5303                                 nbp = aip->ai_buf;
5304                                 nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
5305                                 if (nbp == NULL)
5306                                         goto restart;
5307                                 FREE_LOCK(&lk);
5308                                 if ((error = bwrite(nbp)) != 0) {
5309                                         goto loop_end;
5310                                 }
5311                                 ACQUIRE_LOCK(&lk);
5312                                 goto restart;
5313                         }
5314                         continue;
5315
5316                 case D_INODEDEP:
                             /*
                              * flush_inodedep_deps() is called with lk held;
                              * lk is released here before taking the error
                              * exit.
                              */
5317                         if ((error = flush_inodedep_deps(wk->wk_mp,
5318                             WK_INODEDEP(wk)->id_ino)) != 0) {
5319                                 FREE_LOCK(&lk);
5320                                 break;
5321                         }
5322                         continue;
5323
5324                 case D_PAGEDEP:
5325                         /*
5326                          * We are trying to sync a directory that may
5327                          * have dependencies on both its own metadata
5328                          * and/or dependencies on the inodes of any
5329                          * recently allocated files. We walk its diradd
5330                          * lists pushing out the associated inode.
5331                          */
5332                         pagedep = WK_PAGEDEP(wk);
5333                         for (i = 0; i < DAHASHSZ; i++) {
5334                                 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
5335                                         continue;
5336                                 if ((error =
5337                                     flush_pagedep_deps(vp, wk->wk_mp,
5338                                                 &pagedep->pd_diraddhd[i]))) {
5339                                         FREE_LOCK(&lk);
5340                                         goto loop_end;
5341                                 }
5342                         }
5343                         continue;
5344
5345                 case D_MKDIR:
5346                         /*
5347                          * This case should never happen if the vnode has
5348                          * been properly sync'ed. However, if this function
5349                          * is used at a place where the vnode has not yet
5350                          * been sync'ed, this dependency can show up. So,
5351                          * rather than panic, just flush it.
5352                          */
5353                         nbp = WK_MKDIR(wk)->md_buf;
5354                         nbp = getdirtybuf(nbp, &lk, waitfor);
5355                         if (nbp == NULL)
5356                                 continue;
5357                         FREE_LOCK(&lk);
5358                         if (waitfor == MNT_NOWAIT) {
5359                                 bawrite(nbp);
5360                         } else if ((error = bwrite(nbp)) != 0) {
5361                                 break;
5362                         }
5363                         ACQUIRE_LOCK(&lk);
5364                         continue;
5365
5366                 case D_BMSAFEMAP:
5367                         /*
5368                          * This case should never happen if the vnode has
5369                          * been properly sync'ed. However, if this function
5370                          * is used at a place where the vnode has not yet
5371                          * been sync'ed, this dependency can show up. So,
5372                          * rather than panic, just flush it.
5373                          */
5374                         nbp = WK_BMSAFEMAP(wk)->sm_buf;
5375                         nbp = getdirtybuf(nbp, &lk, waitfor);
5376                         if (nbp == NULL)
5377                                 continue;
5378                         FREE_LOCK(&lk);
5379                         if (waitfor == MNT_NOWAIT) {
5380                                 bawrite(nbp);
5381                         } else if ((error = bwrite(nbp)) != 0) {
5382                                 break;
5383                         }
5384                         ACQUIRE_LOCK(&lk);
5385                         continue;
5386
5387                 default:
5388                         panic("softdep_sync_metadata: Unknown type %s",
5389                             TYPENAME(wk->wk_type));
5390                         /* NOTREACHED */
5391                 }
5392         loop_end:
5393                 /* We reach here only in error and unlocked */
5394                 if (error == 0)
5395                         panic("softdep_sync_metadata: zero error");
                     /*
                      * Undo the recursion flag, hand the buffer we were
                      * working on to bawrite() and report the failure.
                      */
5396                 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5397                 bawrite(bp);
5398                 return (error);
5399         }
5400         FREE_LOCK(&lk);
             /*
              * Select the following dirty buffer while we still hold the
              * current one, then start the write of the current one.
              */
5401         VI_LOCK(vp);
5402         while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
5403                 nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
5404                 if (nbp)
5405                         break;
5406         }
5407         VI_UNLOCK(vp);
5408         bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5409         bawrite(bp);
5410         if (nbp != NULL) {
5411                 bp = nbp;
5412                 goto loop;
5413         }
5414         /*
5415          * The brief unlock is to allow any pent up dependency
5416          * processing to be done. Then proceed with the second pass.
5417          */
5418         if (waitfor == MNT_NOWAIT) {
5419                 waitfor = MNT_WAIT;
5420                 goto top;
5421         }
5422
5423         /*
5424          * If we have managed to get rid of all the dirty buffers,
5425          * then we are done. For certain directories and block
5426          * devices, we may need to do further work.
5427          *
5428          * We must wait for any I/O in progress to finish so that
5429          * all potential buffers on the dirty list will be visible.
              *
              * NOTE(review): the code below only calls drain_output() and
              * returns 0; no "further work" is done here, so the first
              * paragraph above looks stale -- confirm against the callers.
5430          */
5431         VI_LOCK(vp);
5432         drain_output(vp);
5433         VI_UNLOCK(vp);
5434         return (0);
5435 }
5436
5437 /*
5438  * Flush the dependencies associated with an inodedep.
5439  * Called with splbio blocked.
5440  */
5441 static int
5442 flush_inodedep_deps(mp, ino)
5443         struct mount *mp;
5444         ino_t ino;
5445 {
5446         struct inodedep *inodedep;
5447         int error, waitfor;
5448
5449         /*
5450          * This work is done in two passes. The first pass grabs most
5451          * of the buffers and begins asynchronously writing them. The
5452          * only way to wait for these asynchronous writes is to sleep
5453          * on the filesystem vnode which may stay busy for a long time
5454          * if the filesystem is active. So, instead, we make a second
5455          * pass over the dependencies blocking on each write. In the
5456          * usual case we will be blocking against a write that we
5457          * initiated, so when it is done the dependency will have been
5458          * resolved. Thus the second pass is expected to end quickly.
5459          * We give a brief window at the top of the loop to allow
5460          * any pending I/O to complete.
5461          */
5462         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
5463                 if (error)
5464                         return (error);
5465                 FREE_LOCK(&lk);
5466                 ACQUIRE_LOCK(&lk);
5467                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
5468                         return (0);
5469                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
5470                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
5471                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
5472                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
5473                         continue;
5474                 /*
5475                  * If pass2, we are done, otherwise do pass 2.
5476                  */
5477                 if (waitfor == MNT_WAIT)
5478                         break;
5479                 waitfor = MNT_WAIT;
5480         }
5481         /*
5482          * Try freeing inodedep in case all dependencies have been removed.
5483          */
5484         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
5485                 (void) free_inodedep(inodedep);
5486         return (0);
5487 }
5488
5489 /*
5490  * Flush an inode dependency list.
5491  * Called with splbio blocked.
5492  */
5493 static int
5494 flush_deplist(listhead, waitfor, errorp)
5495         struct allocdirectlst *listhead;
5496         int waitfor;
5497         int *errorp;
5498 {
5499         struct allocdirect *adp;
5500         struct buf *bp;
5501
5502         mtx_assert(&lk, MA_OWNED);
5503         TAILQ_FOREACH(adp, listhead, ad_next) {
5504                 if (adp->ad_state & DEPCOMPLETE)
5505                         continue;
5506                 bp = adp->ad_buf;
5507                 bp = getdirtybuf(bp, &lk, waitfor);
5508                 if (bp == NULL) {
5509                         if (waitfor == MNT_NOWAIT)
5510                                 continue;
5511                         return (1);
5512                 }
5513                 FREE_LOCK(&lk);
5514                 if (waitfor == MNT_NOWAIT) {
5515                         bawrite(bp);
5516                 } else if ((*errorp = bwrite(bp)) != 0) {
5517                         ACQUIRE_LOCK(&lk);
5518                         return (1);
5519                 }
5520                 ACQUIRE_LOCK(&lk);
5521                 return (1);
5522         }
5523         return (0);
5524 }
5525
5526 /*
5527  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5528  * Called with splbio blocked.
5529  */
5530 static int
5531 flush_pagedep_deps(pvp, mp, diraddhdp)
5532         struct vnode *pvp;
5533         struct mount *mp;
5534         struct diraddhd *diraddhdp;
5535 {
5536         struct inodedep *inodedep;
5537         struct ufsmount *ump;
5538         struct diradd *dap;
5539         struct vnode *vp;
5540         int error = 0;
5541         struct buf *bp;
5542         ino_t inum;
5543         struct worklist *wk;
5544
5545         ump = VFSTOUFS(mp);
5546         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5547                 /*
5548                  * Flush ourselves if this directory entry
5549                  * has a MKDIR_PARENT dependency.
5550                  */
5551                 if (dap->da_state & MKDIR_PARENT) {
5552                         FREE_LOCK(&lk);
5553                         if ((error = ffs_update(pvp, 1)) != 0)
5554                                 break;
5555                         ACQUIRE_LOCK(&lk);
5556                         /*
5557                          * If that cleared dependencies, go on to next.
5558                          */
5559                         if (dap != LIST_FIRST(diraddhdp))
5560                                 continue;
5561                         if (dap->da_state & MKDIR_PARENT)
5562                                 panic("flush_pagedep_deps: MKDIR_PARENT");
5563                 }
5564                 /*
5565                  * A newly allocated directory must have its "." and
5566                  * ".." entries written out before its name can be
5567                  * committed in its parent. We do not want or need
5568                  * the full semantics of a synchronous ffs_syncvnode as
5569                  * that may end up here again, once for each directory
5570                  * level in the filesystem. Instead, we push the blocks
5571                  * and wait for them to clear. We have to fsync twice
5572                  * because the first call may choose to defer blocks
5573                  * that still have dependencies, but deferral will
5574                  * happen at most once.
5575                  */
5576                 inum = dap->da_newinum;
5577                 if (dap->da_state & MKDIR_BODY) {
5578                         FREE_LOCK(&lk);
5579                         if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
5580                                 break;
5581                         if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
5582                             (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
5583                                 vput(vp);
5584                                 break;
5585                         }
5586                         VI_LOCK(vp);
5587                         drain_output(vp);
5588                         /*
5589                          * If first block is still dirty with a D_MKDIR
5590                          * dependency then it needs to be written now.
5591                          */
5592                         for (;;) {
5593                                 error = 0;
5594                                 bp = gbincore(&vp->v_bufobj, 0);
5595                                 if (bp == NULL)
5596                                         break;  /* First block not present */
5597                                 error = BUF_LOCK(bp,
5598                                                  LK_EXCLUSIVE |
5599                                                  LK_SLEEPFAIL |
5600                                                  LK_INTERLOCK,
5601                                                  VI_MTX(vp));
5602                                 VI_LOCK(vp);
5603                                 if (error == ENOLCK)
5604                                         continue;       /* Slept, retry */
5605                                 if (error != 0)
5606                                         break;          /* Failed */
5607                                 if ((bp->b_flags & B_DELWRI) == 0) {
5608                                         BUF_UNLOCK(bp);
5609                                         break;  /* Buffer not dirty */
5610                                 }
5611                                 for (wk = LIST_FIRST(&bp->b_dep);
5612                                      wk != NULL;
5613                                      wk = LIST_NEXT(wk, wk_list))
5614                                         if (wk->wk_type == D_MKDIR)
5615                                                 break;
5616                                 if (wk == NULL)
5617                                         BUF_UNLOCK(bp); /* Dependency gone */
5618                                 else {
5619                                         /*
5620                                          * D_MKDIR dependency remains,
5621                                          * must write buffer to stable
5622                                          * storage.
5623                                          */
5624                                         VI_UNLOCK(vp);
5625                                         bremfree(bp);
5626                                         error = bwrite(bp);
5627                                         VI_LOCK(vp);
5628                                 }
5629                                 break;
5630                         }
5631                         VI_UNLOCK(vp);
5632                         vput(vp);
5633                         if (error != 0)
5634                                 break;  /* Flushing of first block failed */
5635                         ACQUIRE_LOCK(&lk);
5636                         /*
5637                          * If that cleared dependencies, go on to next.
5638                          */
5639                         if (dap != LIST_FIRST(diraddhdp))
5640                                 continue;
5641                         if (dap->da_state & MKDIR_BODY)
5642                                 panic("flush_pagedep_deps: MKDIR_BODY");
5643                 }
5644                 /*
5645                  * Flush the inode on which the directory entry depends.
5646                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5647                  * the only remaining dependency is that the updated inode
5648                  * count must get pushed to disk. The inode has already
5649                  * been pushed into its inode buffer (via VOP_UPDATE) at
5650                  * the time of the reference count change. So we need only
5651                  * locate that buffer, ensure that there will be no rollback
5652                  * caused by a bitmap dependency, then write the inode buffer.
5653                  */
5654 retry:
5655                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
5656                         panic("flush_pagedep_deps: lost inode");
5657                 /*
5658                  * If the inode still has bitmap dependencies,
5659                  * push them to disk.
5660                  */
5661                 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5662                         bp = inodedep->id_buf;
5663                         bp = getdirtybuf(bp, &lk, MNT_WAIT);
5664                         if (bp == NULL)
5665                                 goto retry;
5666                         FREE_LOCK(&lk);
5667                         if ((error = bwrite(bp)) != 0)
5668                                 break;
5669                         ACQUIRE_LOCK(&lk);
5670                         if (dap != LIST_FIRST(diraddhdp))
5671                                 continue;
5672                 }
5673                 /*
5674                  * If the inode is still sitting in a buffer waiting
5675                  * to be written, push it to disk.
5676                  */
5677                 FREE_LOCK(&lk);
5678                 if ((error = bread(ump->um_devvp,
5679                     fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5680                     (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5681                         brelse(bp);
5682                         break;
5683                 }
5684                 if ((error = bwrite(bp)) != 0)
5685                         break;
5686                 ACQUIRE_LOCK(&lk);
5687                 /*
5688                  * If we have failed to get rid of all the dependencies
5689                  * then something is seriously wrong.
5690                  */
5691                 if (dap == LIST_FIRST(diraddhdp))
5692                         panic("flush_pagedep_deps: flush failed");
5693         }
5694         if (error)
5695                 ACQUIRE_LOCK(&lk);
5696         return (error);
5697 }
5698
5699 /*
5700  * A large burst of file addition or deletion activity can drive the
5701  * memory load excessively high. First attempt to slow things down
5702  * using the techniques below. If that fails, this routine requests
5703  * the offending operations to fall back to running synchronously
5704  * until the memory load returns to a reasonable level.
5705  */
5706 int
5707 softdep_slowdown(vp)
5708         struct vnode *vp;
5709 {
5710         int max_softdeps_hard;
5711
5712         ACQUIRE_LOCK(&lk);
5713         max_softdeps_hard = max_softdeps * 11 / 10;
5714         if (num_dirrem < max_softdeps_hard / 2 &&
5715             num_inodedep < max_softdeps_hard &&
5716             VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps) {
5717                 FREE_LOCK(&lk);
5718                 return (0);
5719         }
5720         if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
5721                 softdep_speedup();
5722         stat_sync_limit_hit += 1;
5723         FREE_LOCK(&lk);
5724         return (1);
5725 }
5726
5727 /*
5728  * Called by the allocation routines when they are about to fail
5729  * in the hope that we can free up some disk space.
5730  *
5731  * First check to see if the work list has anything on it. If it has,
5732  * clean up entries until we successfully free some space. Because this
5733  * process holds inodes locked, we cannot handle any remove requests
5734  * that might block on a locked inode as that could lead to deadlock.
5735  * If the worklist yields no free space, encourage the syncer daemon
5736  * to help us. In no event will we try for longer than tickdelay seconds.
5737  */
5738 int
5739 softdep_request_cleanup(fs, vp)
5740         struct fs *fs;
5741         struct vnode *vp;
5742 {
5743         struct ufsmount *ump;
5744         long starttime;
5745         ufs2_daddr_t needed;
5746         int error;
5747
5748         ump = VTOI(vp)->i_ump;
5749         mtx_assert(UFS_MTX(ump), MA_OWNED);
5750         needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
5751         starttime = time_second + tickdelay;
5752         /*
5753          * If we are being called because of a process doing a
5754          * copy-on-write, then it is not safe to update the vnode
5755          * as we may recurse into the copy-on-write routine.
5756          */
5757         if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
5758                 UFS_UNLOCK(ump);
5759                 error = ffs_update(vp, 1);
5760                 UFS_LOCK(ump);
5761                 if (error != 0)
5762                         return (0);
5763         }
5764         while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
5765                 if (time_second > starttime)
5766                         return (0);
5767                 UFS_UNLOCK(ump);
5768                 ACQUIRE_LOCK(&lk);
5769                 if (ump->softdep_on_worklist > 0 &&
5770                     process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
5771                         stat_worklist_push += 1;
5772                         FREE_LOCK(&lk);
5773                         UFS_LOCK(ump);
5774                         continue;
5775                 }
5776                 request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
5777                 FREE_LOCK(&lk);
5778                 UFS_LOCK(ump);
5779         }
5780         return (1);
5781 }
5782
5783 /*
5784  * If memory utilization has gotten too high, deliberately slow things
5785  * down and speed up the I/O processing.
5786  */
5787 extern struct thread *syncertd;
5788 static int
5789 request_cleanup(mp, resource)
5790         struct mount *mp;
5791         int resource;
5792 {
5793         struct thread *td = curthread;
5794         struct ufsmount *ump;
5795
5796         mtx_assert(&lk, MA_OWNED);
5797         /*
5798          * We never hold up the filesystem syncer or buf daemon.
5799          */
5800         if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
5801                 return (0);
5802         ump = VFSTOUFS(mp);
5803         /*
5804          * First check to see if the work list has gotten backlogged.
5805          * If it has, co-opt this process to help clean up two entries.
5806          * Because this process may hold inodes locked, we cannot
5807          * handle any remove requests that might block on a locked
5808          * inode as that could lead to deadlock.  We set TDP_SOFTDEP
5809          * to avoid recursively processing the worklist.
5810          */
5811         if (ump->softdep_on_worklist > max_softdeps / 10) {
5812                 td->td_pflags |= TDP_SOFTDEP;
5813                 process_worklist_item(mp, LK_NOWAIT);
5814                 process_worklist_item(mp, LK_NOWAIT);
5815                 td->td_pflags &= ~TDP_SOFTDEP;
5816                 stat_worklist_push += 2;
5817                 return(1);
5818         }
5819         /*
5820          * Next, we attempt to speed up the syncer process. If that
5821          * is successful, then we allow the process to continue.
5822          */
5823         if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
5824                 return(0);
5825         /*
5826          * If we are resource constrained on inode dependencies, try
5827          * flushing some dirty inodes. Otherwise, we are constrained
5828          * by file deletions, so try accelerating flushes of directories
5829          * with removal dependencies. We would like to do the cleanup
5830          * here, but we probably hold an inode locked at this point and
5831          * that might deadlock against one that we try to clean. So,
5832          * the best that we can do is request the syncer daemon to do
5833          * the cleanup for us.
5834          */
5835         switch (resource) {
5836
5837         case FLUSH_INODES:
5838                 stat_ino_limit_push += 1;
5839                 req_clear_inodedeps += 1;
5840                 stat_countp = &stat_ino_limit_hit;
5841                 break;
5842
5843         case FLUSH_REMOVE:
5844         case FLUSH_REMOVE_WAIT:
5845                 stat_blk_limit_push += 1;
5846                 req_clear_remove += 1;
5847                 stat_countp = &stat_blk_limit_hit;
5848                 break;
5849
5850         default:
5851                 panic("request_cleanup: unknown type");
5852         }
5853         /*
5854          * Hopefully the syncer daemon will catch up and awaken us.
5855          * We wait at most tickdelay before proceeding in any case.
5856          */
5857         proc_waiting += 1;
5858         if (handle.callout == NULL)
5859                 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5860         msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
5861         proc_waiting -= 1;
5862         return (1);
5863 }
5864
5865 /*
5866  * Awaken processes pausing in request_cleanup and clear proc_waiting
5867  * to indicate that there is no longer a timer running.
5868  */
5869 static void
5870 pause_timer(arg)
5871         void *arg;
5872 {
5873
5874         ACQUIRE_LOCK(&lk);
5875         *stat_countp += 1;
5876         wakeup_one(&proc_waiting);
5877         if (proc_waiting > 0)
5878                 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5879         else
5880                 handle.callout = NULL;
5881         FREE_LOCK(&lk);
5882 }
5883
5884 /*
5885  * Flush out a directory with at least one removal dependency in an effort to
5886  * reduce the number of dirrem, freefile, and freeblks dependency structures.
5887  */
5888 static void
5889 clear_remove(td)
5890         struct thread *td;
5891 {
5892         struct pagedep_hashhead *pagedephd;
5893         struct pagedep *pagedep;
5894         static int next = 0;
5895         struct mount *mp;
5896         struct vnode *vp;
5897         int error, cnt;
5898         ino_t ino;
5899
5900         mtx_assert(&lk, MA_OWNED);
5901
5902         for (cnt = 0; cnt < pagedep_hash; cnt++) {
5903                 pagedephd = &pagedep_hashtbl[next++];
5904                 if (next >= pagedep_hash)
5905                         next = 0;
5906                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5907                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
5908                                 continue;
5909                         mp = pagedep->pd_list.wk_mp;
5910                         ino = pagedep->pd_ino;
5911                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5912                                 continue;
5913                         FREE_LOCK(&lk);
5914                         if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
5915                                 softdep_error("clear_remove: vget", error);
5916                                 vn_finished_write(mp);
5917                                 ACQUIRE_LOCK(&lk);
5918                                 return;
5919                         }
5920                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
5921                                 softdep_error("clear_remove: fsync", error);
5922                         VI_LOCK(vp);
5923                         drain_output(vp);
5924                         VI_UNLOCK(vp);
5925                         vput(vp);
5926                         vn_finished_write(mp);
5927                         ACQUIRE_LOCK(&lk);
5928                         return;
5929                 }
5930         }
5931 }
5932
5933 /*
5934  * Clear out a block of dirty inodes in an effort to reduce
5935  * the number of inodedep dependency structures.
5936  */
5937 static void
5938 clear_inodedeps(td)
5939         struct thread *td;
5940 {
5941         struct inodedep_hashhead *inodedephd;
5942         struct inodedep *inodedep;
5943         static int next = 0;
5944         struct mount *mp;
5945         struct vnode *vp;
5946         struct fs *fs;
5947         int error, cnt;
5948         ino_t firstino, lastino, ino;
5949
5950         mtx_assert(&lk, MA_OWNED);
5951         /*
5952          * Pick a random inode dependency to be cleared.
5953          * We will then gather up all the inodes in its block
5954          * that have dependencies and flush them out.
5955          */
5956         for (cnt = 0; cnt < inodedep_hash; cnt++) {
5957                 inodedephd = &inodedep_hashtbl[next++];
5958                 if (next >= inodedep_hash)
5959                         next = 0;
5960                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5961                         break;
5962         }
5963         if (inodedep == NULL)
5964                 return;
5965         fs = inodedep->id_fs;
5966         mp = inodedep->id_list.wk_mp;
5967         /*
5968          * Find the last inode in the block with dependencies.
5969          */
5970         firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5971         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5972                 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
5973                         break;
5974         /*
5975          * Asynchronously push all but the last inode with dependencies.
5976          * Synchronously push the last inode with dependencies to ensure
5977          * that the inode block gets written to free up the inodedeps.
5978          */
5979         for (ino = firstino; ino <= lastino; ino++) {
5980                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
5981                         continue;
5982                 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5983                         continue;
5984                 FREE_LOCK(&lk);
5985                 if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
5986                         softdep_error("clear_inodedeps: vget", error);
5987                         vn_finished_write(mp);
5988                         ACQUIRE_LOCK(&lk);
5989                         return;
5990                 }
5991                 if (ino == lastino) {
5992                         if ((error = ffs_syncvnode(vp, MNT_WAIT)))
5993                                 softdep_error("clear_inodedeps: fsync1", error);
5994                 } else {
5995                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
5996                                 softdep_error("clear_inodedeps: fsync2", error);
5997                         VI_LOCK(vp);
5998                         drain_output(vp);
5999                         VI_UNLOCK(vp);
6000                 }
6001                 vput(vp);
6002                 vn_finished_write(mp);
6003                 ACQUIRE_LOCK(&lk);
6004         }
6005 }
6006
6007 /*
6008  * Function to determine if the buffer has outstanding dependencies
6009  * that will cause a roll-back if the buffer is written. If wantcount
6010  * is set, return number of dependencies, otherwise just yes or no.
6011  */
6012 static int
6013 softdep_count_dependencies(bp, wantcount)
6014         struct buf *bp;
6015         int wantcount;
6016 {
6017         struct worklist *wk;
6018         struct inodedep *inodedep;
6019         struct indirdep *indirdep;
6020         struct allocindir *aip;
6021         struct pagedep *pagedep;
6022         struct diradd *dap;
6023         int i, retval;
6024
6025         retval = 0;
6026         ACQUIRE_LOCK(&lk);
6027         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
6028                 switch (wk->wk_type) {
6029
6030                 case D_INODEDEP:
6031                         inodedep = WK_INODEDEP(wk);
6032                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
6033                                 /* bitmap allocation dependency */
6034                                 retval += 1;
6035                                 if (!wantcount)
6036                                         goto out;
6037                         }
6038                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
6039                                 /* direct block pointer dependency */
6040                                 retval += 1;
6041                                 if (!wantcount)
6042                                         goto out;
6043                         }
6044                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
6045                                 /* direct block pointer dependency */
6046                                 retval += 1;
6047                                 if (!wantcount)
6048                                         goto out;
6049                         }
6050                         continue;
6051
6052                 case D_INDIRDEP:
6053                         indirdep = WK_INDIRDEP(wk);
6054
6055                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
6056                                 /* indirect block pointer dependency */
6057                                 retval += 1;
6058                                 if (!wantcount)
6059                                         goto out;
6060                         }
6061                         continue;
6062
6063                 case D_PAGEDEP:
6064                         pagedep = WK_PAGEDEP(wk);
6065                         for (i = 0; i < DAHASHSZ; i++) {
6066
6067                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
6068                                         /* directory entry dependency */
6069                                         retval += 1;
6070                                         if (!wantcount)
6071                                                 goto out;
6072                                 }
6073                         }
6074                         continue;
6075
6076                 case D_BMSAFEMAP:
6077                 case D_ALLOCDIRECT:
6078                 case D_ALLOCINDIR:
6079                 case D_MKDIR:
6080                         /* never a dependency on these blocks */
6081                         continue;
6082
6083                 default:
6084                         panic("softdep_check_for_rollback: Unexpected type %s",
6085                             TYPENAME(wk->wk_type));
6086                         /* NOTREACHED */
6087                 }
6088         }
6089 out:
6090         FREE_LOCK(&lk);
6091         return retval;
6092 }
6093
6094 /*
6095  * Acquire exclusive access to a buffer.
6096  * Must be called with a locked mtx parameter.
6097  * Return acquired buffer or NULL on failure.
6098  */
6099 static struct buf *
6100 getdirtybuf(bp, mtx, waitfor)
6101         struct buf *bp;
6102         struct mtx *mtx;
6103         int waitfor;
6104 {
6105         int error;
6106
6107         mtx_assert(mtx, MA_OWNED);
6108         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
6109                 if (waitfor != MNT_WAIT)
6110                         return (NULL);
6111                 error = BUF_LOCK(bp,
6112                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
6113                 /*
6114                  * Even if we sucessfully acquire bp here, we have dropped
6115                  * mtx, which may violates our guarantee.
6116                  */
6117                 if (error == 0)
6118                         BUF_UNLOCK(bp);
6119                 else if (error != ENOLCK)
6120                         panic("getdirtybuf: inconsistent lock: %d", error);
6121                 mtx_lock(mtx);
6122                 return (NULL);
6123         }
6124         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
6125                 if (mtx == &lk && waitfor == MNT_WAIT) {
6126                         mtx_unlock(mtx);
6127                         BO_LOCK(bp->b_bufobj);
6128                         BUF_UNLOCK(bp);
6129                         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
6130                                 bp->b_vflags |= BV_BKGRDWAIT;
6131                                 msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
6132                                        PRIBIO | PDROP, "getbuf", 0);
6133                         } else
6134                                 BO_UNLOCK(bp->b_bufobj);
6135                         mtx_lock(mtx);
6136                         return (NULL);
6137                 }
6138                 BUF_UNLOCK(bp);
6139                 if (waitfor != MNT_WAIT)
6140                         return (NULL);
6141                 /*
6142                  * The mtx argument must be bp->b_vp's mutex in
6143                  * this case.
6144                  */
6145 #ifdef  DEBUG_VFS_LOCKS
6146                 if (bp->b_vp->v_type != VCHR)
6147                         ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
6148 #endif
6149                 bp->b_vflags |= BV_BKGRDWAIT;
6150                 msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
6151                 return (NULL);
6152         }
6153         if ((bp->b_flags & B_DELWRI) == 0) {
6154                 BUF_UNLOCK(bp);
6155                 return (NULL);
6156         }
6157         bremfree(bp);
6158         return (bp);
6159 }
6160
6161
6162 /*
6163  * Check if it is safe to suspend the file system now.  On entry,
6164  * the vnode interlock for devvp should be held.  Return 0 with
6165  * the mount interlock held if the file system can be suspended now,
6166  * otherwise return EAGAIN with the mount interlock held.
6167  */
6168 int
6169 softdep_check_suspend(struct mount *mp,
6170                       struct vnode *devvp,
6171                       int softdep_deps,
6172                       int softdep_accdeps,
6173                       int secondary_writes,
6174                       int secondary_accwrites)
6175 {
6176         struct bufobj *bo;
6177         struct ufsmount *ump;
6178         int error;
6179
6180         ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
6181         ump = VFSTOUFS(mp);
6182         bo = &devvp->v_bufobj;
6183
6184         for (;;) {
6185                 if (!TRY_ACQUIRE_LOCK(&lk)) {
6186                         VI_UNLOCK(devvp);
6187                         ACQUIRE_LOCK(&lk);
6188                         FREE_LOCK(&lk);
6189                         VI_LOCK(devvp);
6190                         continue;
6191                 }
6192                 if (!MNT_ITRYLOCK(mp)) {
6193                         FREE_LOCK(&lk);
6194                         VI_UNLOCK(devvp);
6195                         MNT_ILOCK(mp);
6196                         MNT_IUNLOCK(mp);
6197                         VI_LOCK(devvp);
6198                         continue;
6199                 }
6200                 if (mp->mnt_secondary_writes != 0) {
6201                         FREE_LOCK(&lk);
6202                         VI_UNLOCK(devvp);
6203                         msleep(&mp->mnt_secondary_writes,
6204                                MNT_MTX(mp),
6205                                (PUSER - 1) | PDROP, "secwr", 0);
6206                         VI_LOCK(devvp);
6207                         continue;
6208                 }
6209                 break;
6210         }
6211
6212         /*
6213          * Reasons for needing more work before suspend:
6214          * - Dirty buffers on devvp.
6215          * - Softdep activity occurred after start of vnode sync loop
6216          * - Secondary writes occurred after start of vnode sync loop
6217          */
6218         error = 0;
6219         if (bo->bo_numoutput > 0 ||
6220             bo->bo_dirty.bv_cnt > 0 ||
6221             softdep_deps != 0 ||
6222             ump->softdep_deps != 0 ||
6223             softdep_accdeps != ump->softdep_accdeps ||
6224             secondary_writes != 0 ||
6225             mp->mnt_secondary_writes != 0 ||
6226             secondary_accwrites != mp->mnt_secondary_accwrites)
6227                 error = EAGAIN;
6228         FREE_LOCK(&lk);
6229         VI_UNLOCK(devvp);
6230         return (error);
6231 }
6232
6233
6234 /*
6235  * Get the number of dependency structures for the file system, both
6236  * the current number and the total number allocated.  These will
6237  * later be used to detect that softdep processing has occurred.
6238  */
6239 void
6240 softdep_get_depcounts(struct mount *mp,
6241                       int *softdep_depsp,
6242                       int *softdep_accdepsp)
6243 {
6244         struct ufsmount *ump;
6245
6246         ump = VFSTOUFS(mp);
6247         ACQUIRE_LOCK(&lk);
6248         *softdep_depsp = ump->softdep_deps;
6249         *softdep_accdepsp = ump->softdep_accdeps;
6250         FREE_LOCK(&lk);
6251 }
6252
6253 /*
6254  * Wait for pending output on a vnode to complete.
6255  * Must be called with vnode lock and interlock locked.
6256  *
6257  * XXX: Should just be a call to bufobj_wwait().
6258  */
6259 static void
6260 drain_output(vp)
6261         struct vnode *vp;
6262 {
6263         ASSERT_VOP_LOCKED(vp, "drain_output");
6264         ASSERT_VI_LOCKED(vp, "drain_output");
6265
6266         while (vp->v_bufobj.bo_numoutput) {
6267                 vp->v_bufobj.bo_flag |= BO_WWAIT;
6268                 msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
6269                     VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
6270         }
6271 }
6272
6273 /*
6274  * Called whenever a buffer that is being invalidated or reallocated
6275  * contains dependencies. This should only happen if an I/O error has
6276  * occurred. The routine is called with the buffer locked.
6277  */
6278 static void
6279 softdep_deallocate_dependencies(bp)
6280         struct buf *bp;
6281 {
6282
6283         if ((bp->b_ioflags & BIO_ERROR) == 0)
6284                 panic("softdep_deallocate_dependencies: dangling deps");
6285         softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
6286         panic("softdep_deallocate_dependencies: unrecovered I/O error");
6287 }
6288
/*
 * Report an asynchronous write error in the filesystem.  func labels
 * the message (callers pass either a "routine: operation" string or a
 * mount point name) and error is the error number to print.  The error
 * is only logged; nothing is done to recover from it.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better than just printing! */
	printf("%s: got error %d while accessing filesystem\n",
	    func, error);
}
6301
6302 #endif /* SOFTUPDATES */