sys/ufs/ffs/ffs_softdep.c

   1 /*-
   2  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
   3  *
   4  * The soft updates code is derived from the appendix of a University
   5  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
   6  * "Soft Updates: A Solution to the Metadata Update Problem in File
   7  * Systems", CSE-TR-254-95, August 1995).
   8  *
   9  * Further information about soft updates can be obtained from:
  10  *
  11  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
  12  *      1614 Oxford Street              mckusick@mckusick.com
  13  *      Berkeley, CA 94709-1608         +1-510-843-9542
  14  *      USA
  15  *
  16  * Redistribution and use in source and binary forms, with or without
  17  * modification, are permitted provided that the following conditions
  18  * are met:
  19  *
  20  * 1. Redistributions of source code must retain the above copyright
  21  *    notice, this list of conditions and the following disclaimer.
  22  * 2. Redistributions in binary form must reproduce the above copyright
  23  *    notice, this list of conditions and the following disclaimer in the
  24  *    documentation and/or other materials provided with the distribution.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  27  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  28  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  29  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  30  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
  39  */
  40
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * For now we want the safety net that the DEBUG flag provides.
  46  */
  47 #ifndef DEBUG
  48 #define DEBUG
  49 #endif
  50
  51 #include <sys/param.h>
  52 #include <sys/kernel.h>
  53 #include <sys/systm.h>
  54 #include <sys/bio.h>
  55 #include <sys/buf.h>
  56 #include <sys/kdb.h>
  57 #include <sys/kthread.h>
  58 #include <sys/lock.h>
  59 #include <sys/malloc.h>
  60 #include <sys/mount.h>
  61 #include <sys/mutex.h>
  62 #include <sys/proc.h>
  63 #include <sys/stat.h>
  64 #include <sys/sysctl.h>
  65 #include <sys/syslog.h>
  66 #include <sys/vnode.h>
  67 #include <sys/conf.h>
  68 #include <ufs/ufs/dir.h>
  69 #include <ufs/ufs/extattr.h>
  70 #include <ufs/ufs/quota.h>
  71 #include <ufs/ufs/inode.h>
  72 #include <ufs/ufs/ufsmount.h>
  73 #include <ufs/ffs/fs.h>
  74 #include <ufs/ffs/softdep.h>
  75 #include <ufs/ffs/ffs_extern.h>
  76 #include <ufs/ufs/ufs_extern.h>
  77
  78 #include <vm/vm.h>
  79
  80 #include "opt_ffs.h"
  81
  82 #ifndef SOFTUPDATES
  83
  /*
   * Placeholder compiled only when SOFTUPDATES is not configured.
   * It must never be reached: the arguments are ignored and the
   * kernel panics.
   */
  int
  softdep_flushfiles(struct mount *oldmnt, int flags, struct thread *td)
  {

          panic("softdep_flushfiles called");
  }
  93
  /*
   * Placeholder compiled only when SOFTUPDATES is not configured.
   * Nothing needs to be set up, so every argument is ignored and
   * success (0) is reported.
   */
  int
  softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs,
      struct ucred *cred)
  {

          return (0);
  }
 104
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured:
  * there is no state to initialize, so this does nothing.
  */
 void
 softdep_initialize(void)
 {
 }
 111
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured:
  * there is no state to tear down, so this does nothing.
  */
 void
 softdep_uninitialize(void)
 {
 }
 118
 119 void
 120 softdep_setup_inomapdep(bp, ip, newinum)
 121         struct buf *bp;
 122         struct inode *ip;
 123         ino_t newinum;
 124 {
 125
 126         panic("softdep_setup_inomapdep called");
 127 }
 128
 129 void
 130 softdep_setup_blkmapdep(bp, mp, newblkno)
 131         struct buf *bp;
 132         struct mount *mp;
 133         ufs2_daddr_t newblkno;
 134 {
 135
 136         panic("softdep_setup_blkmapdep called");
 137 }
 138
 139 void
 140 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 141         struct inode *ip;
 142         ufs_lbn_t lbn;
 143         ufs2_daddr_t newblkno;
 144         ufs2_daddr_t oldblkno;
 145         long newsize;
 146         long oldsize;
 147         struct buf *bp;
 148 {
 149
 150         panic("softdep_setup_allocdirect called");
 151 }
 152
 153 void
 154 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 155         struct inode *ip;
 156         ufs_lbn_t lbn;
 157         ufs2_daddr_t newblkno;
 158         ufs2_daddr_t oldblkno;
 159         long newsize;
 160         long oldsize;
 161         struct buf *bp;
 162 {
 163
 164         panic("softdep_setup_allocext called");
 165 }
 166
 167 void
 168 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 169         struct inode *ip;
 170         ufs_lbn_t lbn;
 171         struct buf *bp;
 172         int ptrno;
 173         ufs2_daddr_t newblkno;
 174         ufs2_daddr_t oldblkno;
 175         struct buf *nbp;
 176 {
 177
 178         panic("softdep_setup_allocindir_page called");
 179 }
 180
 181 void
 182 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 183         struct buf *nbp;
 184         struct inode *ip;
 185         struct buf *bp;
 186         int ptrno;
 187         ufs2_daddr_t newblkno;
 188 {
 189
 190         panic("softdep_setup_allocindir_meta called");
 191 }
 192
 193 void
 194 softdep_setup_freeblocks(ip, length, flags)
 195         struct inode *ip;
 196         off_t length;
 197         int flags;
 198 {
 199
 200         panic("softdep_setup_freeblocks called");
 201 }
 202
 203 void
 204 softdep_freefile(pvp, ino, mode)
 205                 struct vnode *pvp;
 206                 ino_t ino;
 207                 int mode;
 208 {
 209
 210         panic("softdep_freefile called");
 211 }
 212
 213 int
 214 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 215         struct buf *bp;
 216         struct inode *dp;
 217         off_t diroffset;
 218         ino_t newinum;
 219         struct buf *newdirbp;
 220         int isnewblk;
 221 {
 222
 223         panic("softdep_setup_directory_add called");
 224 }
 225
 226 void
 227 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
 228         struct inode *dp;
 229         caddr_t base;
 230         caddr_t oldloc;
 231         caddr_t newloc;
 232         int entrysize;
 233 {
 234
 235         panic("softdep_change_directoryentry_offset called");
 236 }
 237
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * The arguments are ignored; being called at all is fatal.
  */
 void
 softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip,
     int isrmdir)
 {

         panic("softdep_setup_remove called");
 }
 248
 249 void
 250 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 251         struct buf *bp;
 252         struct inode *dp;
 253         struct inode *ip;
 254         ino_t newinum;
 255         int isrmdir;
 256 {
 257
 258         panic("softdep_setup_directory_change called");
 259 }
 260
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * The argument is ignored; being called at all is fatal.
  */
 void
 softdep_change_linkcnt(struct inode *ip)
 {

         panic("softdep_change_linkcnt called");
 }
 268
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * The argument is ignored; being called at all is fatal.
  */
 void
 softdep_load_inodeblock(struct inode *ip)
 {

         panic("softdep_load_inodeblock called");
 }
 276
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * The arguments are ignored; being called at all is fatal.
  */
 void
 softdep_update_inodeblock(struct inode *ip, struct buf *bp, int waitfor)
 {

         panic("softdep_update_inodeblock called");
 }
 286
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * vp is the "in_core" copy of the inode; it is not examined and
  * success (0) is always reported.
  */
 int
 softdep_fsync(struct vnode *vp)
 {

         return (0);
 }
 294
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured:
  * the vnode is not examined and nothing is done.
  */
 void
 softdep_fsync_mountdev(struct vnode *vp)
 {
 }
 302
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * There is never any work queued, so *countp is set to zero and
  * success (0) is returned; oldmnt and td are not examined.
  */
 int
 softdep_flushworklist(struct mount *oldmnt, int *countp, struct thread *td)
 {

         *countp = 0;
         return (0);
 }
 313
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured:
  * the vnode is not examined and success (0) is always reported.
  */
 int
 softdep_sync_metadata(struct vnode *vp)
 {
         int error = 0;

         return (error);
 }
 320
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * The argument is ignored and no value is ever returned, because
  * being called at all is fatal.
  */
 int
 softdep_slowdown(struct vnode *vp)
 {

         panic("softdep_slowdown called");
 }
 328
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * ip is the inode with the zero effective link count; it is not
  * examined, because being called at all is fatal.
  */
 void
 softdep_releasefile(struct inode *ip)
 {

         panic("softdep_releasefile called");
 }
 336
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured:
  * neither argument is examined and 0 is always returned.
  */
 int
 softdep_request_cleanup(struct fs *fs, struct vnode *vp)
 {

         return (0);
 }
 345
 346 int
 347 softdep_check_suspend(struct mount *mp,
 348                       struct vnode *devvp,
 349                       int softdep_deps,
 350                       int softdep_accdeps,
 351                       int secondary_writes,
 352                       int secondary_accwrites)
 353 {
 354         struct bufobj *bo;
 355         int error;
 356
 357         (void) softdep_deps,
 358         (void) softdep_accdeps;
 359
 360         bo = &devvp->v_bufobj;
 361         ASSERT_BO_LOCKED(bo);
 362
 363         MNT_ILOCK(mp);
 364         while (mp->mnt_secondary_writes != 0) {
 365                 BO_UNLOCK(bo);
 366                 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
 367                     (PUSER - 1) | PDROP, "secwr", 0);
 368                 BO_LOCK(bo);
 369                 MNT_ILOCK(mp);
 370         }
 371
 372         /*
 373          * Reasons for needing more work before suspend:
 374          * - Dirty buffers on devvp.
 375          * - Secondary writes occurred after start of vnode sync loop
 376          */
 377         error = 0;
 378         if (bo->bo_numoutput > 0 ||
 379             bo->bo_dirty.bv_cnt > 0 ||
 380             secondary_writes != 0 ||
 381             mp->mnt_secondary_writes != 0 ||
 382             secondary_accwrites != mp->mnt_secondary_accwrites)
 383                 error = EAGAIN;
 384         BO_UNLOCK(bo);
 385         return (error);
 386 }
 387
 /*
  * Placeholder compiled only when SOFTUPDATES is not configured.
  * No dependencies are ever tracked, so both counters handed back to
  * the caller are zero; mp is not examined.
  */
 void
 softdep_get_depcounts(struct mount *mp, int *softdepactivep,
     int *softdepactiveaccp)
 {

         (void) mp;
         *softdepactiveaccp = 0;
         *softdepactivep = 0;
 }
 397
 398 #else
 399 /*
 400  * These definitions need to be adapted to the system to which
 401  * this file is being ported.
 402  */
 403 /*
 404  * malloc types defined for the softdep system.
 405  */
 406 static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
 407 static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
 408 static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
 409 static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
 410 static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
 411 static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
 412 static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
 413 static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
 414 static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
 415 static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
 416 static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
 417 static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
 418 static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
 419 static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
 420 static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
 421
 422 #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
 423
 424 #define D_PAGEDEP       0
 425 #define D_INODEDEP      1
 426 #define D_NEWBLK        2
 427 #define D_BMSAFEMAP     3
 428 #define D_ALLOCDIRECT   4
 429 #define D_INDIRDEP      5
 430 #define D_ALLOCINDIR    6
 431 #define D_FREEFRAG      7
 432 #define D_FREEBLKS      8
 433 #define D_FREEFILE      9
 434 #define D_DIRADD        10
 435 #define D_MKDIR         11
 436 #define D_DIRREM        12
 437 #define D_NEWDIRBLK     13
 438 #define D_LAST          D_NEWDIRBLK
 439
 440 /*
 441  * translate from workitem type to memory type
 442  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 443  */
 444 static struct malloc_type *memtype[] = {
 445         M_PAGEDEP,
 446         M_INODEDEP,
 447         M_NEWBLK,
 448         M_BMSAFEMAP,
 449         M_ALLOCDIRECT,
 450         M_INDIRDEP,
 451         M_ALLOCINDIR,
 452         M_FREEFRAG,
 453         M_FREEBLKS,
 454         M_FREEFILE,
 455         M_DIRADD,
 456         M_MKDIR,
 457         M_DIRREM,
 458         M_NEWDIRBLK
 459 };
 460
 461 #define DtoM(type) (memtype[type])
 462
 /*
  * Names of malloc types.
  *
  * Yields the short description of the malloc type that belongs to a
  * workitem type code, or "???" when the code is out of range.  D_LAST
  * is itself a valid code (the highest one), so the comparison has to
  * be inclusive; with a strict "<" the last type was reported as "???".
  * The unsigned cast makes negative codes fail the range check too.
  */
 #define TYPENAME(type)  \
         ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
 468 /*
 469  * End system adaptation definitions.
 470  */
 471
 472 /*
 473  * Forward declarations.
 474  */
 475 struct inodedep_hashhead;
 476 struct newblk_hashhead;
 477 struct pagedep_hashhead;
 478
 479 /*
 480  * Internal function prototypes.
 481  */
 482 static  void softdep_error(char *, int);
 483 static  void drain_output(struct vnode *);
 484 static  struct buf *getdirtybuf(struct buf *, struct mtx *, int);
 485 static  void clear_remove(struct thread *);
 486 static  void clear_inodedeps(struct thread *);
 487 static  int flush_pagedep_deps(struct vnode *, struct mount *,
 488             struct diraddhd *);
 489 static  int flush_inodedep_deps(struct mount *, ino_t);
 490 static  int flush_deplist(struct allocdirectlst *, int, int *);
 491 static  int handle_written_filepage(struct pagedep *, struct buf *);
 492 static  void diradd_inode_written(struct diradd *, struct inodedep *);
 493 static  int handle_written_inodeblock(struct inodedep *, struct buf *);
 494 static  void handle_allocdirect_partdone(struct allocdirect *);
 495 static  void handle_allocindir_partdone(struct allocindir *);
 496 static  void initiate_write_filepage(struct pagedep *, struct buf *);
 497 static  void handle_written_mkdir(struct mkdir *, int);
 498 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 499 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
 500 static  void handle_workitem_freefile(struct freefile *);
 501 static  void handle_workitem_remove(struct dirrem *, struct vnode *);
 502 static  struct dirrem *newdirrem(struct buf *, struct inode *,
 503             struct inode *, int, struct dirrem **);
 504 static  void free_diradd(struct diradd *);
 505 static  void free_allocindir(struct allocindir *, struct inodedep *);
 506 static  void free_newdirblk(struct newdirblk *);
 507 static  int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
 508             ufs2_daddr_t *);
 509 static  void deallocate_dependencies(struct buf *, struct inodedep *);
 510 static  void free_allocdirect(struct allocdirectlst *,
 511             struct allocdirect *, int);
 512 static  int check_inode_unwritten(struct inodedep *);
 513 static  int free_inodedep(struct inodedep *);
 514 static  void handle_workitem_freeblocks(struct freeblks *, int);
 515 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
 516 static  void setup_allocindir_phase2(struct buf *, struct inode *,
 517             struct allocindir *);
 518 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
 519             ufs2_daddr_t);
 520 static  void handle_workitem_freefrag(struct freefrag *);
 521 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
 522 static  void allocdirect_merge(struct allocdirectlst *,
 523             struct allocdirect *, struct allocdirect *);
 524 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
 525 static  int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
 526             struct newblk **);
 527 static  int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
 528 static  int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
 529             struct inodedep **);
 530 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
 531 static  int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
 532 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 533             struct mount *mp, int, struct pagedep **);
 534 static  void pause_timer(void *);
 535 static  int request_cleanup(struct mount *, int);
 536 static  int process_worklist_item(struct mount *, int);
 537 static  void add_to_worklist(struct worklist *);
 538 static  void softdep_flush(void);
 539 static  int softdep_speedup(void);
 540
 541 /*
 542  * Exported softdep operations.
 543  */
 544 static  void softdep_disk_io_initiation(struct buf *);
 545 static  void softdep_disk_write_complete(struct buf *);
 546 static  void softdep_deallocate_dependencies(struct buf *);
 547 static  int softdep_count_dependencies(struct buf *bp, int);
 548
 549 static struct mtx lk;
 550 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
 551
 552 #define TRY_ACQUIRE_LOCK(lk)            mtx_trylock(lk)
 553 #define ACQUIRE_LOCK(lk)                mtx_lock(lk)
 554 #define FREE_LOCK(lk)                   mtx_unlock(lk)
 555
 556 #define BUF_AREC(bp)    ((bp)->b_lock.lock_object.lo_flags |= LK_CANRECURSE)
 557 #define BUF_NOREC(bp)   ((bp)->b_lock.lock_object.lo_flags &= ~LK_CANRECURSE)
 558
 /*
  * Worklist queue management.
  * These routines require that the lock be held.
  *
  * With DEBUG defined the operations are real functions that assert
  * ownership of the softdep lock and panic when an item is inserted
  * twice or removed while not queued.  Without DEBUG they collapse to
  * plain macros that only maintain the ONWORKLIST flag and the list
  * linkage.
  */
 #ifdef DEBUG
 static void worklist_insert(struct workhead *, struct worklist *);
 static void worklist_remove(struct worklist *);

 #define WORKLIST_INSERT(head, item)     worklist_insert(head, item)
 #define WORKLIST_REMOVE(item)           worklist_remove(item)

 /*
  * Put item at the head of the list "head" and mark it as queued.
  */
 static void
 worklist_insert(struct workhead *head, struct worklist *item)
 {

         mtx_assert(&lk, MA_OWNED);
         if ((item->wk_state & ONWORKLIST) != 0)
                 panic("worklist_insert: already on list");
         item->wk_state |= ONWORKLIST;
         LIST_INSERT_HEAD(head, item, wk_list);
 }

 /*
  * Take item off whatever list it is on and clear its queued mark.
  */
 static void
 worklist_remove(struct worklist *item)
 {

         mtx_assert(&lk, MA_OWNED);
         if (!(item->wk_state & ONWORKLIST))
                 panic("worklist_remove: not on list");
         item->wk_state &= ~ONWORKLIST;
         LIST_REMOVE(item, wk_list);
 }
 #else /* !DEBUG */
 #define WORKLIST_INSERT(head, item) do {        \
         (item)->wk_state |= ONWORKLIST;         \
         LIST_INSERT_HEAD(head, item, wk_list);  \
 } while (0)
 #define WORKLIST_REMOVE(item) do {              \
         (item)->wk_state &= ~ONWORKLIST;        \
         LIST_REMOVE(item, wk_list);             \
 } while (0)
 #endif /* DEBUG */
 604
 605 /*
 606  * Routines for tracking and managing workitems.
 607  */
 608 static  void workitem_free(struct worklist *, int);
 609 static  void workitem_alloc(struct worklist *, int, struct mount *);
 610
 611 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
 612
 613 static void
 614 workitem_free(item, type)
 615         struct worklist *item;
 616         int type;
 617 {
 618         struct ufsmount *ump;
 619         mtx_assert(&lk, MA_OWNED);
 620
 621 #ifdef DEBUG
 622         if (item->wk_state & ONWORKLIST)
 623                 panic("workitem_free: still on list");
 624         if (item->wk_type != type)
 625                 panic("workitem_free: type mismatch");
 626 #endif
 627         ump = VFSTOUFS(item->wk_mp);
 628         if (--ump->softdep_deps == 0 && ump->softdep_req)
 629                 wakeup(&ump->softdep_deps);
 630         FREE(item, DtoM(type));
 631 }
 632
 633 static void
 634 workitem_alloc(item, type, mp)
 635         struct worklist *item;
 636         int type;
 637         struct mount *mp;
 638 {
 639         item->wk_type = type;
 640         item->wk_mp = mp;
 641         item->wk_state = 0;
 642         ACQUIRE_LOCK(&lk);
 643         VFSTOUFS(mp)->softdep_deps++;
 644         VFSTOUFS(mp)->softdep_accdeps++;
 645         FREE_LOCK(&lk);
 646 }
 647
 648 /*
 649  * Workitem queue management
 650  */
 651 static int max_softdeps;        /* maximum number of structs before slowdown */
 652 static int maxindirdeps = 50;   /* max number of indirdeps before slowdown */
 653 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
 654 static int proc_waiting;        /* tracks whether we have a timeout posted */
 655 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
 656 static struct callout softdep_callout;
 657 static int req_pending;
 658 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
 659 #define FLUSH_INODES            1
 660 static int req_clear_remove;    /* syncer process flush some freeblks */
 661 #define FLUSH_REMOVE            2
 662 #define FLUSH_REMOVE_WAIT       3
 663 /*
 664  * runtime statistics
 665  */
 666 static int stat_worklist_push;  /* number of worklist cleanups */
 667 static int stat_blk_limit_push; /* number of times block limit neared */
 668 static int stat_ino_limit_push; /* number of times inode limit neared */
 669 static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
 670 static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
 671 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
 672 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
 673 static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
 674 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
 675 static int stat_dir_entry;      /* bufs redirtied as dir entry cannot write */
 676
 677 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
 678 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
 679 SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
 680 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
 681 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
 682 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
 683 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
 684 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
 685 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
 686 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
 687 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
 688 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
 689 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
 690 /* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
 691
 692 SYSCTL_DECL(_vfs_ffs);
 693
 694 static int compute_summary_at_mount = 0;        /* Whether to recompute the summary at mount time */
 695 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
 696            &compute_summary_at_mount, 0, "Recompute summary at mount");
 697
 698 static struct proc *softdepproc;
 699 static struct kproc_desc softdep_kp = {
 700         "softdepflush",
 701         softdep_flush,
 702         &softdepproc
 703 };
 704 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
 705     &softdep_kp);
 706
 707 static void
 708 softdep_flush(void)
 709 {
 710         struct mount *nmp;
 711         struct mount *mp;
 712         struct ufsmount *ump;
 713         struct thread *td;
 714         int remaining;
 715         int vfslocked;
 716
 717         td = curthread;
 718         td->td_pflags |= TDP_NORUNNINGBUF;
 719
 720         for (;;) {
 721                 kproc_suspend_check(softdepproc);
 722                 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
 723                 ACQUIRE_LOCK(&lk);
 724                 /*
 725                  * If requested, try removing inode or removal dependencies.
 726                  */
 727                 if (req_clear_inodedeps) {
 728                         clear_inodedeps(td);
 729                         req_clear_inodedeps -= 1;
 730                         wakeup_one(&proc_waiting);
 731                 }
 732                 if (req_clear_remove) {
 733                         clear_remove(td);
 734                         req_clear_remove -= 1;
 735                         wakeup_one(&proc_waiting);
 736                 }
 737                 FREE_LOCK(&lk);
 738                 VFS_UNLOCK_GIANT(vfslocked);
 739                 remaining = 0;
 740                 mtx_lock(&mountlist_mtx);
 741                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
 742                         nmp = TAILQ_NEXT(mp, mnt_list);
 743                         if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
 744                                 continue;
 745                         if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 746                                 continue;
 747                         vfslocked = VFS_LOCK_GIANT(mp);
 748                         softdep_process_worklist(mp, 0);
 749                         ump = VFSTOUFS(mp);
 750                         remaining += ump->softdep_on_worklist -
 751                                 ump->softdep_on_worklist_inprogress;
 752                         VFS_UNLOCK_GIANT(vfslocked);
 753                         mtx_lock(&mountlist_mtx);
 754                         nmp = TAILQ_NEXT(mp, mnt_list);
 755                         vfs_unbusy(mp, td);
 756                 }
 757                 mtx_unlock(&mountlist_mtx);
 758                 if (remaining)
 759                         continue;
 760                 ACQUIRE_LOCK(&lk);
 761                 if (!req_pending)
 762                         msleep(&req_pending, &lk, PVM, "sdflush", hz);
 763                 req_pending = 0;
 764                 FREE_LOCK(&lk);
 765         }
 766 }
 767
 768 static int
 769 softdep_speedup(void)
 770 {
 771
 772         mtx_assert(&lk, MA_OWNED);
 773         if (req_pending == 0) {
 774                 req_pending = 1;
 775                 wakeup(&req_pending);
 776         }
 777
 778         return speedup_syncer();
 779 }
 780
 781 /*
 782  * Add an item to the end of the work queue.
 783  * This routine requires that the lock be held.
 784  * This is the only routine that adds items to the list.
 785  * The following routine is the only one that removes items
 786  * and does so in order from first to last.
 787  */
 788 static void
 789 add_to_worklist(wk)
 790         struct worklist *wk;
 791 {
 792         struct ufsmount *ump;
 793
 794         mtx_assert(&lk, MA_OWNED);
 795         ump = VFSTOUFS(wk->wk_mp);
 796         if (wk->wk_state & ONWORKLIST)
 797                 panic("add_to_worklist: already on list");
 798         wk->wk_state |= ONWORKLIST;
 799         if (LIST_EMPTY(&ump->softdep_workitem_pending))
 800                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 801         else
 802                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
 803         ump->softdep_worklist_tail = wk;
 804         ump->softdep_on_worklist += 1;
 805 }
 806
 807 /*
 808  * Process that runs once per second to handle items in the background queue.
 809  *
 810  * Note that we ensure that everything is done in the order in which they
 811  * appear in the queue. The code below depends on this property to ensure
 812  * that blocks of a file are freed before the inode itself is freed. This
 813  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 814  * until all the old ones have been purged from the dependency lists.
 815  */
 816 int
 817 softdep_process_worklist(mp, full)
 818         struct mount *mp;
 819         int full;
 820 {
 821         struct thread *td = curthread;
 822         int cnt, matchcnt, loopcount;
 823         struct ufsmount *ump;
 824         long starttime;
 825
 826         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
 827         /*
 828          * Record the process identifier of our caller so that we can give
 829          * this process preferential treatment in request_cleanup below.
 830          */
 831         matchcnt = 0;
 832         ump = VFSTOUFS(mp);
 833         ACQUIRE_LOCK(&lk);
 834         loopcount = 1;
 835         starttime = time_second;
 836         while (ump->softdep_on_worklist > 0) {
 837                 if ((cnt = process_worklist_item(mp, 0)) == -1)
 838                         break;
 839                 else
 840                         matchcnt += cnt;
 841                 /*
 842                  * If requested, try removing inode or removal dependencies.
 843                  */
 844                 if (req_clear_inodedeps) {
 845                         clear_inodedeps(td);
 846                         req_clear_inodedeps -= 1;
 847                         wakeup_one(&proc_waiting);
 848                 }
 849                 if (req_clear_remove) {
 850                         clear_remove(td);
 851                         req_clear_remove -= 1;
 852                         wakeup_one(&proc_waiting);
 853                 }
 854                 /*
 855                  * We do not generally want to stop for buffer space, but if
 856                  * we are really being a buffer hog, we will stop and wait.
 857                  */
 858                 if (loopcount++ % 128 == 0) {
 859                         FREE_LOCK(&lk);
 860                         uio_yield();
 861                         bwillwrite();
 862                         ACQUIRE_LOCK(&lk);
 863                 }
 864                 /*
 865                  * Never allow processing to run for more than one
 866                  * second. Otherwise the other mountpoints may get
 867                  * excessively backlogged.
 868                  */
 869                 if (!full && starttime != time_second) {
 870                         matchcnt = -1;
 871                         break;
 872                 }
 873         }
 874         FREE_LOCK(&lk);
 875         return (matchcnt);
 876 }
 877
 878 /*
 879  * Process one item on the worklist.
 880  */
 881 static int
 882 process_worklist_item(mp, flags)
 883         struct mount *mp;
 884         int flags;
 885 {
 886         struct worklist *wk, *wkend;
 887         struct ufsmount *ump;
 888         struct vnode *vp;
 889         int matchcnt = 0;
 890
 891         mtx_assert(&lk, MA_OWNED);
 892         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
 893         /*
 894          * If we are being called because of a process doing a
 895          * copy-on-write, then it is not safe to write as we may
 896          * recurse into the copy-on-write routine.
 897          */
 898         if (curthread->td_pflags & TDP_COWINPROGRESS)
 899                 return (-1);
 900         /*
 901          * Normally we just process each item on the worklist in order.
 902          * However, if we are in a situation where we cannot lock any
 903          * inodes, we have to skip over any dirrem requests whose
 904          * vnodes are resident and locked.
 905          */
 906         ump = VFSTOUFS(mp);
 907         vp = NULL;
 908         LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
 909                 if (wk->wk_state & INPROGRESS)
 910                         continue;
 911                 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
 912                         break;
 913                 wk->wk_state |= INPROGRESS;
 914                 ump->softdep_on_worklist_inprogress++;
 915                 FREE_LOCK(&lk);
 916                 ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum,
 917                     LK_NOWAIT | LK_EXCLUSIVE, &vp);
 918                 ACQUIRE_LOCK(&lk);
 919                 wk->wk_state &= ~INPROGRESS;
 920                 ump->softdep_on_worklist_inprogress--;
 921                 if (vp != NULL)
 922                         break;
 923         }
 924         if (wk == 0)
 925                 return (-1);
 926         /*
 927          * Remove the item to be processed. If we are removing the last
 928          * item on the list, we need to recalculate the tail pointer.
 929          * As this happens rarely and usually when the list is short,
 930          * we just run down the list to find it rather than tracking it
 931          * in the above loop.
 932          */
 933         WORKLIST_REMOVE(wk);
 934         if (wk == ump->softdep_worklist_tail) {
 935                 LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
 936                         if (LIST_NEXT(wkend, wk_list) == NULL)
 937                                 break;
 938                 ump->softdep_worklist_tail = wkend;
 939         }
 940         ump->softdep_on_worklist -= 1;
 941         FREE_LOCK(&lk);
 942         if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 943                 panic("process_worklist_item: suspended filesystem");
 944         matchcnt++;
 945         switch (wk->wk_type) {
 946
 947         case D_DIRREM:
 948                 /* removal of a directory entry */
 949                 handle_workitem_remove(WK_DIRREM(wk), vp);
 950                 break;
 951
 952         case D_FREEBLKS:
 953                 /* releasing blocks and/or fragments from a file */
 954                 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
 955                 break;
 956
 957         case D_FREEFRAG:
 958                 /* releasing a fragment when replaced as a file grows */
 959                 handle_workitem_freefrag(WK_FREEFRAG(wk));
 960                 break;
 961
 962         case D_FREEFILE:
 963                 /* releasing an inode when its link count drops to 0 */
 964                 handle_workitem_freefile(WK_FREEFILE(wk));
 965                 break;
 966
 967         default:
 968                 panic("%s_process_worklist: Unknown type %s",
 969                     "softdep", TYPENAME(wk->wk_type));
 970                 /* NOTREACHED */
 971         }
 972         vn_finished_secondary_write(mp);
 973         ACQUIRE_LOCK(&lk);
 974         return (matchcnt);
 975 }
 976
 977 /*
 978  * Move dependencies from one buffer to another.
 979  */
 980 void
 981 softdep_move_dependencies(oldbp, newbp)
 982         struct buf *oldbp;
 983         struct buf *newbp;
 984 {
 985         struct worklist *wk, *wktail;
 986
 987         if (!LIST_EMPTY(&newbp->b_dep))
 988                 panic("softdep_move_dependencies: need merge code");
 989         wktail = 0;
 990         ACQUIRE_LOCK(&lk);
 991         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 992                 LIST_REMOVE(wk, wk_list);
 993                 if (wktail == 0)
 994                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
 995                 else
 996                         LIST_INSERT_AFTER(wktail, wk, wk_list);
 997                 wktail = wk;
 998         }
 999         FREE_LOCK(&lk);
1000 }
1001
1002 /*
1003  * Purge the work list of all items associated with a particular mount point.
1004  */
1005 int
1006 softdep_flushworklist(oldmnt, countp, td)
1007         struct mount *oldmnt;
1008         int *countp;
1009         struct thread *td;
1010 {
1011         struct vnode *devvp;
1012         int count, error = 0;
1013         struct ufsmount *ump;
1014
1015         /*
1016          * Alternately flush the block device associated with the mount
1017          * point and process any dependencies that the flushing
1018          * creates. We continue until no more worklist dependencies
1019          * are found.
1020          */
1021         *countp = 0;
1022         ump = VFSTOUFS(oldmnt);
1023         devvp = ump->um_devvp;
1024         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1025                 *countp += count;
1026                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1027                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1028                 VOP_UNLOCK(devvp, 0);
1029                 if (error)
1030                         break;
1031         }
1032         return (error);
1033 }
1034
1035 int
1036 softdep_waitidle(struct mount *mp)
1037 {
1038         struct ufsmount *ump;
1039         int error;
1040         int i;
1041
1042         ump = VFSTOUFS(mp);
1043         ACQUIRE_LOCK(&lk);
1044         for (i = 0; i < 10 && ump->softdep_deps; i++) {
1045                 ump->softdep_req = 1;
1046                 if (ump->softdep_on_worklist)
1047                         panic("softdep_waitidle: work added after flush.");
1048                 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1049         }
1050         ump->softdep_req = 0;
1051         FREE_LOCK(&lk);
1052         error = 0;
1053         if (i == 10) {
1054                 error = EBUSY;
1055                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
1056                     mp);
1057         }
1058
1059         return (error);
1060 }
1061
1062 /*
1063  * Flush all vnodes and worklist items associated with a specified mount point.
1064  */
1065 int
1066 softdep_flushfiles(oldmnt, flags, td)
1067         struct mount *oldmnt;
1068         int flags;
1069         struct thread *td;
1070 {
1071         int error, count, loopcnt;
1072
1073         error = 0;
1074
1075         /*
1076          * Alternately flush the vnodes associated with the mount
1077          * point and process any dependencies that the flushing
1078          * creates. In theory, this loop can happen at most twice,
1079          * but we give it a few extra just to be sure.
1080          */
1081         for (loopcnt = 10; loopcnt > 0; loopcnt--) {
1082                 /*
1083                  * Do another flush in case any vnodes were brought in
1084                  * as part of the cleanup operations.
1085                  */
1086                 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1087                         break;
1088                 if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
1089                     count == 0)
1090                         break;
1091         }
1092         /*
1093          * If we are unmounting then it is an error to fail. If we
1094          * are simply trying to downgrade to read-only, then filesystem
1095          * activity can keep us busy forever, so we just fail with EBUSY.
1096          */
1097         if (loopcnt == 0) {
1098                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1099                         panic("softdep_flushfiles: looping");
1100                 error = EBUSY;
1101         }
1102         if (!error)
1103                 error = softdep_waitidle(oldmnt);
1104         return (error);
1105 }
1106
1107 /*
1108  * Structure hashing.
1109  *
1110  * There are three types of structures that can be looked up:
1111  *      1) pagedep structures identified by mount point, inode number,
1112  *         and logical block.
1113  *      2) inodedep structures identified by mount point and inode number.
1114  *      3) newblk structures identified by mount point and
1115  *         physical block number.
1116  *
1117  * The "pagedep" and "inodedep" dependency structures are hashed
1118  * separately from the file blocks and inodes to which they correspond.
1119  * This separation helps when the in-memory copy of an inode or
1120  * file block must be replaced. It also obviates the need to access
1121  * an inode or file page when simply updating (or de-allocating)
1122  * dependency structures. Lookup of newblk structures is needed to
1123  * find newly allocated blocks when trying to associate them with
1124  * their allocdirect or allocindir structure.
1125  *
1126  * The lookup routines optionally create and hash a new instance when
1127  * an existing entry is not found.
1128  */
/*
 * Flag bits accepted in the "flags" argument of the lookup routines
 * below.  NODELAY is honoured by inodedep_lookup(), which skips its
 * request_cleanup() call when the flag is set.
 */
#define DEPALLOC        0x0001  /* allocate structure if lookup fails */
#define NODELAY         0x0002  /* cannot do background work */
1131
/*
 * Structures and routines associated with pagedep caching.
 *
 * pagedep_hashtbl and pagedep_hash are filled in by hashinit() in
 * softdep_initialize().  PAGEDEP_HASH() yields a pointer to the hash
 * chain head for a (mount point, inode number, logical block) triple:
 * the mount pointer shifted right by 13 bits is added to the inode and
 * block numbers and the sum is masked with pagedep_hash.
 */
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long  pagedep_hash;           /* size of hash table - 1 */
#define PAGEDEP_HASH(mp, inum, lbn) \
        (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
            pagedep_hash])
1140
1141 static int
1142 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1143         struct pagedep_hashhead *pagedephd;
1144         ino_t ino;
1145         ufs_lbn_t lbn;
1146         struct mount *mp;
1147         int flags;
1148         struct pagedep **pagedeppp;
1149 {
1150         struct pagedep *pagedep;
1151
1152         LIST_FOREACH(pagedep, pagedephd, pd_hash)
1153                 if (ino == pagedep->pd_ino &&
1154                     lbn == pagedep->pd_lbn &&
1155                     mp == pagedep->pd_list.wk_mp)
1156                         break;
1157         if (pagedep) {
1158                 *pagedeppp = pagedep;
1159                 if ((flags & DEPALLOC) != 0 &&
1160                     (pagedep->pd_state & ONWORKLIST) == 0)
1161                         return (0);
1162                 return (1);
1163         }
1164         *pagedeppp = NULL;
1165         return (0);
1166 }
1167 /*
1168  * Look up a pagedep. Return 1 if found, 0 if not found or found
1169  * when asked to allocate but not associated with any buffer.
1170  * If not found, allocate if DEPALLOC flag is passed.
1171  * Found or allocated entry is returned in pagedeppp.
1172  * This routine must be called with splbio interrupts blocked.
1173  */
1174 static int
1175 pagedep_lookup(ip, lbn, flags, pagedeppp)
1176         struct inode *ip;
1177         ufs_lbn_t lbn;
1178         int flags;
1179         struct pagedep **pagedeppp;
1180 {
1181         struct pagedep *pagedep;
1182         struct pagedep_hashhead *pagedephd;
1183         struct mount *mp;
1184         int ret;
1185         int i;
1186
1187         mtx_assert(&lk, MA_OWNED);
1188         mp = ITOV(ip)->v_mount;
1189         pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
1190
1191         ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
1192         if (*pagedeppp || (flags & DEPALLOC) == 0)
1193                 return (ret);
1194         FREE_LOCK(&lk);
1195         MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
1196             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1197         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1198         ACQUIRE_LOCK(&lk);
1199         ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
1200         if (*pagedeppp) {
1201                 WORKITEM_FREE(pagedep, D_PAGEDEP);
1202                 return (ret);
1203         }
1204         pagedep->pd_ino = ip->i_number;
1205         pagedep->pd_lbn = lbn;
1206         LIST_INIT(&pagedep->pd_dirremhd);
1207         LIST_INIT(&pagedep->pd_pendinghd);
1208         for (i = 0; i < DAHASHSZ; i++)
1209                 LIST_INIT(&pagedep->pd_diraddhd[i]);
1210         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1211         *pagedeppp = pagedep;
1212         return (0);
1213 }
1214
/*
 * Structures and routines associated with inodedep caching.
 *
 * inodedep_hashtbl and inodedep_hash are filled in by hashinit() in
 * softdep_initialize().  INODEDEP_HASH() yields a pointer to the hash
 * chain head for a (filesystem, inode number) pair: the fs pointer
 * shifted right by 13 bits is added to the inode number and the sum is
 * masked with inodedep_hash.  num_inodedep is incremented each time
 * inodedep_lookup() inserts a new entry and is compared there against
 * max_softdeps.
 */
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long   inodedep_hash;  /* size of hash table - 1 */
static long     num_inodedep;   /* number of inodedep allocated */
#define INODEDEP_HASH(fs, inum) \
      (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1223
1224 static int
1225 inodedep_find(inodedephd, fs, inum, inodedeppp)
1226         struct inodedep_hashhead *inodedephd;
1227         struct fs *fs;
1228         ino_t inum;
1229         struct inodedep **inodedeppp;
1230 {
1231         struct inodedep *inodedep;
1232
1233         LIST_FOREACH(inodedep, inodedephd, id_hash)
1234                 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1235                         break;
1236         if (inodedep) {
1237                 *inodedeppp = inodedep;
1238                 return (1);
1239         }
1240         *inodedeppp = NULL;
1241
1242         return (0);
1243 }
1244 /*
1245  * Look up an inodedep. Return 1 if found, 0 if not found.
1246  * If not found, allocate if DEPALLOC flag is passed.
1247  * Found or allocated entry is returned in inodedeppp.
1248  * This routine must be called with splbio interrupts blocked.
1249  */
1250 static int
1251 inodedep_lookup(mp, inum, flags, inodedeppp)
1252         struct mount *mp;
1253         ino_t inum;
1254         int flags;
1255         struct inodedep **inodedeppp;
1256 {
1257         struct inodedep *inodedep;
1258         struct inodedep_hashhead *inodedephd;
1259         struct fs *fs;
1260
1261         mtx_assert(&lk, MA_OWNED);
1262         fs = VFSTOUFS(mp)->um_fs;
1263         inodedephd = INODEDEP_HASH(fs, inum);
1264
1265         if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1266                 return (1);
1267         if ((flags & DEPALLOC) == 0)
1268                 return (0);
1269         /*
1270          * If we are over our limit, try to improve the situation.
1271          */
1272         if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
1273                 request_cleanup(mp, FLUSH_INODES);
1274         FREE_LOCK(&lk);
1275         MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
1276                 M_INODEDEP, M_SOFTDEP_FLAGS);
1277         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1278         ACQUIRE_LOCK(&lk);
1279         if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1280                 WORKITEM_FREE(inodedep, D_INODEDEP);
1281                 return (1);
1282         }
1283         num_inodedep += 1;
1284         inodedep->id_fs = fs;
1285         inodedep->id_ino = inum;
1286         inodedep->id_state = ALLCOMPLETE;
1287         inodedep->id_nlinkdelta = 0;
1288         inodedep->id_savedino1 = NULL;
1289         inodedep->id_savedsize = -1;
1290         inodedep->id_savedextsize = -1;
1291         inodedep->id_buf = NULL;
1292         LIST_INIT(&inodedep->id_pendinghd);
1293         LIST_INIT(&inodedep->id_inowait);
1294         LIST_INIT(&inodedep->id_bufwait);
1295         TAILQ_INIT(&inodedep->id_inoupdt);
1296         TAILQ_INIT(&inodedep->id_newinoupdt);
1297         TAILQ_INIT(&inodedep->id_extupdt);
1298         TAILQ_INIT(&inodedep->id_newextupdt);
1299         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1300         *inodedeppp = inodedep;
1301         return (0);
1302 }
1303
/*
 * Structures and routines associated with newblk caching.
 *
 * newblk_hashtbl and newblk_hash are filled in by hashinit() in
 * softdep_initialize().  NEWBLK_HASH() yields a pointer to the hash
 * chain head for a (filesystem, number) pair.  Although the second
 * macro parameter is named "inum", newblk_lookup() passes the physical
 * block number (newblkno) in that position.
 */
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long  newblk_hash;            /* size of hash table - 1 */
#define NEWBLK_HASH(fs, inum) \
        (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1311
1312 static int
1313 newblk_find(newblkhd, fs, newblkno, newblkpp)
1314         struct newblk_hashhead *newblkhd;
1315         struct fs *fs;
1316         ufs2_daddr_t newblkno;
1317         struct newblk **newblkpp;
1318 {
1319         struct newblk *newblk;
1320
1321         LIST_FOREACH(newblk, newblkhd, nb_hash)
1322                 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1323                         break;
1324         if (newblk) {
1325                 *newblkpp = newblk;
1326                 return (1);
1327         }
1328         *newblkpp = NULL;
1329         return (0);
1330 }
1331
1332 /*
1333  * Look up a newblk. Return 1 if found, 0 if not found.
1334  * If not found, allocate if DEPALLOC flag is passed.
1335  * Found or allocated entry is returned in newblkpp.
1336  */
1337 static int
1338 newblk_lookup(fs, newblkno, flags, newblkpp)
1339         struct fs *fs;
1340         ufs2_daddr_t newblkno;
1341         int flags;
1342         struct newblk **newblkpp;
1343 {
1344         struct newblk *newblk;
1345         struct newblk_hashhead *newblkhd;
1346
1347         newblkhd = NEWBLK_HASH(fs, newblkno);
1348         if (newblk_find(newblkhd, fs, newblkno, newblkpp))
1349                 return (1);
1350         if ((flags & DEPALLOC) == 0)
1351                 return (0);
1352         FREE_LOCK(&lk);
1353         MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1354                 M_NEWBLK, M_SOFTDEP_FLAGS);
1355         ACQUIRE_LOCK(&lk);
1356         if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
1357                 FREE(newblk, M_NEWBLK);
1358                 return (1);
1359         }
1360         newblk->nb_state = 0;
1361         newblk->nb_fs = fs;
1362         newblk->nb_newblkno = newblkno;
1363         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1364         *newblkpp = newblk;
1365         return (0);
1366 }
1367
1368 /*
1369  * Executed during filesystem system initialization before
1370  * mounting any filesystems.
1371  */
1372 void
1373 softdep_initialize()
1374 {
1375
1376         LIST_INIT(&mkdirlisthd);
1377         max_softdeps = desiredvnodes * 4;
1378         pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1379             &pagedep_hash);
1380         inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1381         newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1382
1383         /* initialise bioops hack */
1384         bioops.io_start = softdep_disk_io_initiation;
1385         bioops.io_complete = softdep_disk_write_complete;
1386         bioops.io_deallocate = softdep_deallocate_dependencies;
1387         bioops.io_countdeps = softdep_count_dependencies;
1388
1389         /* Initialize the callout with an mtx. */
1390         callout_init_mtx(&softdep_callout, &lk, 0);
1391 }
1392
1393 /*
1394  * Executed after all filesystems have been unmounted during
1395  * filesystem module unload.
1396  */
1397 void
1398 softdep_uninitialize()
1399 {
1400
1401         callout_drain(&softdep_callout);
1402         hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
1403         hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
1404         hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
1405 }
1406
1407 /*
1408  * Called at mount time to notify the dependency code that a
1409  * filesystem wishes to use it.
1410  */
1411 int
1412 softdep_mount(devvp, mp, fs, cred)
1413         struct vnode *devvp;
1414         struct mount *mp;
1415         struct fs *fs;
1416         struct ucred *cred;
1417 {
1418         struct csum_total cstotal;
1419         struct ufsmount *ump;
1420         struct cg *cgp;
1421         struct buf *bp;
1422         int error, cyl;
1423
1424         MNT_ILOCK(mp);
1425         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
1426         if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
1427                 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
1428                         MNTK_SOFTDEP;
1429                 mp->mnt_noasync++;
1430         }
1431         MNT_IUNLOCK(mp);
1432         ump = VFSTOUFS(mp);
1433         LIST_INIT(&ump->softdep_workitem_pending);
1434         ump->softdep_worklist_tail = NULL;
1435         ump->softdep_on_worklist = 0;
1436         ump->softdep_deps = 0;
1437         /*
1438          * When doing soft updates, the counters in the
1439          * superblock may have gotten out of sync. Recomputation
1440          * can take a long time and can be deferred for background
1441          * fsck.  However, the old behavior of scanning the cylinder
1442          * groups and recalculating them at mount time is available
1443          * by setting vfs.ffs.compute_summary_at_mount to one.
1444          */
1445         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
1446                 return (0);
1447         bzero(&cstotal, sizeof cstotal);
1448         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1449                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1450                     fs->fs_cgsize, cred, &bp)) != 0) {
1451                         brelse(bp);
1452                         return (error);
1453                 }
1454                 cgp = (struct cg *)bp->b_data;
1455                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1456                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1457                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1458                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1459                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1460                 brelse(bp);
1461         }
1462 #ifdef DEBUG
1463         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1464                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1465 #endif
1466         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1467         return (0);
1468 }
1469
1470 /*
1471  * Protecting the freemaps (or bitmaps).
1472  *
1473  * To eliminate the need to execute fsck before mounting a filesystem
1474  * after a power failure, one must (conservatively) guarantee that the
1475  * on-disk copy of the bitmaps never indicate that a live inode or block is
1476  * free.  So, when a block or inode is allocated, the bitmap should be
1477  * updated (on disk) before any new pointers.  When a block or inode is
1478  * freed, the bitmap should not be updated until all pointers have been
1479  * reset.  The latter dependency is handled by the delayed de-allocation
1480  * approach described below for block and inode de-allocation.  The former
1481  * dependency is handled by calling the following procedure when a block or
1482  * inode is allocated. When an inode is allocated an "inodedep" is created
1483  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1484  * Each "inodedep" is also inserted into the hash indexing structure so
1485  * that any additional link additions can be made dependent on the inode
1486  * allocation.
1487  *
1488  * The ufs filesystem maintains a number of free block counts (e.g., per
1489  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1490  * in addition to the bitmaps.  These counts are used to improve efficiency
1491  * during allocation and therefore must be consistent with the bitmaps.
1492  * There is no convenient way to guarantee post-crash consistency of these
1493  * counts with simple update ordering, for two main reasons: (1) The counts
1494  * and bitmaps for a single cylinder group block are not in the same disk
1495  * sector.  If a disk write is interrupted (e.g., by power failure), one may
1496  * be written and the other not.  (2) Some of the counts are located in the
1497  * superblock rather than the cylinder group block. So, we focus our soft
1498  * updates implementation on protecting the bitmaps. When mounting a
1499  * filesystem, we recompute the auxiliary counts from the bitmaps.
1500  */
1501
1502 /*
1503  * Called just after updating the cylinder group block to allocate an inode.
1504  */
1505 void
1506 softdep_setup_inomapdep(bp, ip, newinum)
1507         struct buf *bp;         /* buffer for cylgroup block with inode map */
1508         struct inode *ip;       /* inode related to allocation */
1509         ino_t newinum;          /* new inode number being allocated */
1510 {
1511         struct inodedep *inodedep;
1512         struct bmsafemap *bmsafemap;
1513
1514         /*
1515          * Create a dependency for the newly allocated inode.
1516          * Panic if it already exists as something is seriously wrong.
1517          * Otherwise add it to the dependency list for the buffer holding
1518          * the cylinder group map from which it was allocated.
1519          */
1520         ACQUIRE_LOCK(&lk);
1521         if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
1522             &inodedep)))
1523                 panic("softdep_setup_inomapdep: dependency for new inode "
1524                     "already exists");
1525         inodedep->id_buf = bp;
1526         inodedep->id_state &= ~DEPCOMPLETE;
1527         bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
1528         LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1529         FREE_LOCK(&lk);
1530 }
1531
1532 /*
1533  * Called just after updating the cylinder group block to
1534  * allocate block or fragment.
1535  */
1536 void
1537 softdep_setup_blkmapdep(bp, mp, newblkno)
1538         struct buf *bp;         /* buffer for cylgroup block with block map */
1539         struct mount *mp;       /* filesystem doing allocation */
1540         ufs2_daddr_t newblkno;  /* number of newly allocated block */
1541 {
1542         struct newblk *newblk;
1543         struct bmsafemap *bmsafemap;
1544         struct fs *fs;
1545
1546         fs = VFSTOUFS(mp)->um_fs;
1547         /*
1548          * Create a dependency for the newly allocated block.
1549          * Add it to the dependency list for the buffer holding
1550          * the cylinder group map from which it was allocated.
1551          */
1552         ACQUIRE_LOCK(&lk);
1553         if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1554                 panic("softdep_setup_blkmapdep: found block");
1555         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
1556         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1557         FREE_LOCK(&lk);
1558 }
1559
1560 /*
1561  * Find the bmsafemap associated with a cylinder group buffer.
1562  * If none exists, create one. The buffer must be locked when
1563  * this routine is called and this routine must be called with
1564  * splbio interrupts blocked.
1565  */
1566 static struct bmsafemap *
1567 bmsafemap_lookup(mp, bp)
1568         struct mount *mp;
1569         struct buf *bp;
1570 {
1571         struct bmsafemap *bmsafemap;
1572         struct worklist *wk;
1573
1574         mtx_assert(&lk, MA_OWNED);
1575         LIST_FOREACH(wk, &bp->b_dep, wk_list)
1576                 if (wk->wk_type == D_BMSAFEMAP)
1577                         return (WK_BMSAFEMAP(wk));
1578         FREE_LOCK(&lk);
1579         MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1580                 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1581         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
1582         bmsafemap->sm_buf = bp;
1583         LIST_INIT(&bmsafemap->sm_allocdirecthd);
1584         LIST_INIT(&bmsafemap->sm_allocindirhd);
1585         LIST_INIT(&bmsafemap->sm_inodedephd);
1586         LIST_INIT(&bmsafemap->sm_newblkhd);
1587         ACQUIRE_LOCK(&lk);
1588         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1589         return (bmsafemap);
1590 }
1591
1592 /*
1593  * Direct block allocation dependencies.
1594  *
1595  * When a new block is allocated, the corresponding disk locations must be
1596  * initialized (with zeros or new data) before the on-disk inode points to
1597  * them.  Also, the freemap from which the block was allocated must be
1598  * updated (on disk) before the inode's pointer. These two dependencies are
1599  * independent of each other and are needed for all file blocks and indirect
1600  * blocks that are pointed to directly by the inode.  Just before the
1601  * "in-core" version of the inode is updated with a newly allocated block
1602  * number, a procedure (below) is called to setup allocation dependency
1603  * structures.  These structures are removed when the corresponding
1604  * dependencies are satisfied or when the block allocation becomes obsolete
1605  * (i.e., the file is deleted, the block is de-allocated, or the block is a
1606  * fragment that gets upgraded).  All of these cases are handled in
1607  * procedures described later.
1608  *
1609  * When a file extension causes a fragment to be upgraded, either to a larger
1610  * fragment or to a full block, the on-disk location may change (if the
1611  * previous fragment could not simply be extended). In this case, the old
1612  * fragment must be de-allocated, but not until after the inode's pointer has
1613  * been updated. In most cases, this is handled by later procedures, which
1614  * will construct a "freefrag" structure to be added to the workitem queue
1615  * when the inode update is complete (or obsolete).  The main exception to
1616  * this is when an allocation occurs while a pending allocation dependency
1617  * (for the same block pointer) remains.  This case is handled in the main
1618  * allocation dependency setup procedure by immediately freeing the
1619  * unreferenced fragments.
1620  */
1621 void
1622 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1623         struct inode *ip;       /* inode to which block is being added */
1624         ufs_lbn_t lbn;          /* block pointer within inode */
1625         ufs2_daddr_t newblkno;  /* disk block number being added */
1626         ufs2_daddr_t oldblkno;  /* previous block number, 0 unless frag */
1627         long newsize;           /* size of new block */
1628         long oldsize;           /* size of new block */
1629         struct buf *bp;         /* bp for allocated block */
1630 {
1631         struct allocdirect *adp, *oldadp;
1632         struct allocdirectlst *adphead;
1633         struct bmsafemap *bmsafemap;
1634         struct inodedep *inodedep;
1635         struct pagedep *pagedep;
1636         struct newblk *newblk;
1637         struct mount *mp;
1638
1639         mp = UFSTOVFS(ip->i_ump);
1640         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1641                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1642         workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
1643         adp->ad_lbn = lbn;
1644         adp->ad_newblkno = newblkno;
1645         adp->ad_oldblkno = oldblkno;
1646         adp->ad_newsize = newsize;
1647         adp->ad_oldsize = oldsize;
1648         adp->ad_state = ATTACHED;
1649         LIST_INIT(&adp->ad_newdirblk);
1650         if (newblkno == oldblkno)
1651                 adp->ad_freefrag = NULL;
1652         else
1653                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1654
1655         ACQUIRE_LOCK(&lk);
1656         if (lbn >= NDADDR) {
1657                 /* allocating an indirect block */
1658                 if (oldblkno != 0)
1659                         panic("softdep_setup_allocdirect: non-zero indir");
1660         } else {
1661                 /*
1662                  * Allocating a direct block.
1663                  *
1664                  * If we are allocating a directory block, then we must
1665                  * allocate an associated pagedep to track additions and
1666                  * deletions.
1667                  */
1668                 if ((ip->i_mode & IFMT) == IFDIR &&
1669                     pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1670                         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1671         }
1672         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1673                 panic("softdep_setup_allocdirect: lost block");
1674         if (newblk->nb_state == DEPCOMPLETE) {
1675                 adp->ad_state |= DEPCOMPLETE;
1676                 adp->ad_buf = NULL;
1677         } else {
1678                 bmsafemap = newblk->nb_bmsafemap;
1679                 adp->ad_buf = bmsafemap->sm_buf;
1680                 LIST_REMOVE(newblk, nb_deps);
1681                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1682         }
1683         LIST_REMOVE(newblk, nb_hash);
1684         FREE(newblk, M_NEWBLK);
1685
1686         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1687         adp->ad_inodedep = inodedep;
1688         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1689         /*
1690          * The list of allocdirects must be kept in sorted and ascending
1691          * order so that the rollback routines can quickly determine the
1692          * first uncommitted block (the size of the file stored on disk
1693          * ends at the end of the lowest committed fragment, or if there
1694          * are no fragments, at the end of the highest committed block).
1695          * Since files generally grow, the typical case is that the new
1696          * block is to be added at the end of the list. We speed this
1697          * special case by checking against the last allocdirect in the
1698          * list before laboriously traversing the list looking for the
1699          * insertion point.
1700          */
1701         adphead = &inodedep->id_newinoupdt;
1702         oldadp = TAILQ_LAST(adphead, allocdirectlst);
1703         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1704                 /* insert at end of list */
1705                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1706                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1707                         allocdirect_merge(adphead, adp, oldadp);
1708                 FREE_LOCK(&lk);
1709                 return;
1710         }
1711         TAILQ_FOREACH(oldadp, adphead, ad_next) {
1712                 if (oldadp->ad_lbn >= lbn)
1713                         break;
1714         }
1715         if (oldadp == NULL)
1716                 panic("softdep_setup_allocdirect: lost entry");
1717         /* insert in middle of list */
1718         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1719         if (oldadp->ad_lbn == lbn)
1720                 allocdirect_merge(adphead, adp, oldadp);
1721         FREE_LOCK(&lk);
1722 }
1723
1724 /*
1725  * Replace an old allocdirect dependency with a newer one.
1726  * This routine must be called with splbio interrupts blocked.
1727  */
1728 static void
1729 allocdirect_merge(adphead, newadp, oldadp)
1730         struct allocdirectlst *adphead; /* head of list holding allocdirects */
1731         struct allocdirect *newadp;     /* allocdirect being added */
1732         struct allocdirect *oldadp;     /* existing allocdirect being checked */
1733 {
1734         struct worklist *wk;
1735         struct freefrag *freefrag;
1736         struct newdirblk *newdirblk;
1737
1738         mtx_assert(&lk, MA_OWNED);
1739         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1740             newadp->ad_oldsize != oldadp->ad_newsize ||
1741             newadp->ad_lbn >= NDADDR)
1742                 panic("%s %jd != new %jd || old size %ld != new %ld",
1743                     "allocdirect_merge: old blkno",
1744                     (intmax_t)newadp->ad_oldblkno,
1745                     (intmax_t)oldadp->ad_newblkno,
1746                     newadp->ad_oldsize, oldadp->ad_newsize);
1747         newadp->ad_oldblkno = oldadp->ad_oldblkno;
1748         newadp->ad_oldsize = oldadp->ad_oldsize;
1749         /*
1750          * If the old dependency had a fragment to free or had never
1751          * previously had a block allocated, then the new dependency
1752          * can immediately post its freefrag and adopt the old freefrag.
1753          * This action is done by swapping the freefrag dependencies.
1754          * The new dependency gains the old one's freefrag, and the
1755          * old one gets the new one and then immediately puts it on
1756          * the worklist when it is freed by free_allocdirect. It is
1757          * not possible to do this swap when the old dependency had a
1758          * non-zero size but no previous fragment to free. This condition
1759          * arises when the new block is an extension of the old block.
1760          * Here, the first part of the fragment allocated to the new
1761          * dependency is part of the block currently claimed on disk by
1762          * the old dependency, so cannot legitimately be freed until the
1763          * conditions for the new dependency are fulfilled.
1764          */
1765         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1766                 freefrag = newadp->ad_freefrag;
1767                 newadp->ad_freefrag = oldadp->ad_freefrag;
1768                 oldadp->ad_freefrag = freefrag;
1769         }
1770         /*
1771          * If we are tracking a new directory-block allocation,
1772          * move it from the old allocdirect to the new allocdirect.
1773          */
1774         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1775                 newdirblk = WK_NEWDIRBLK(wk);
1776                 WORKLIST_REMOVE(&newdirblk->db_list);
1777                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
1778                         panic("allocdirect_merge: extra newdirblk");
1779                 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1780         }
1781         free_allocdirect(adphead, oldadp, 0);
1782 }
1783
1784 /*
1785  * Allocate a new freefrag structure if needed.
1786  */
1787 static struct freefrag *
1788 newfreefrag(ip, blkno, size)
1789         struct inode *ip;
1790         ufs2_daddr_t blkno;
1791         long size;
1792 {
1793         struct freefrag *freefrag;
1794         struct fs *fs;
1795
1796         if (blkno == 0)
1797                 return (NULL);
1798         fs = ip->i_fs;
1799         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1800                 panic("newfreefrag: frag size");
1801         MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1802                 M_FREEFRAG, M_SOFTDEP_FLAGS);
1803         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
1804         freefrag->ff_inum = ip->i_number;
1805         freefrag->ff_blkno = blkno;
1806         freefrag->ff_fragsize = size;
1807         return (freefrag);
1808 }
1809
1810 /*
1811  * This workitem de-allocates fragments that were replaced during
1812  * file block allocation.
1813  */
1814 static void
1815 handle_workitem_freefrag(freefrag)
1816         struct freefrag *freefrag;
1817 {
1818         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
1819
1820         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
1821             freefrag->ff_fragsize, freefrag->ff_inum);
1822         ACQUIRE_LOCK(&lk);
1823         WORKITEM_FREE(freefrag, D_FREEFRAG);
1824         FREE_LOCK(&lk);
1825 }
1826
1827 /*
1828  * Set up a dependency structure for an external attributes data block.
1829  * This routine follows much of the structure of softdep_setup_allocdirect.
1830  * See the description of softdep_setup_allocdirect above for details.
1831  */
1832 void
1833 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1834         struct inode *ip;
1835         ufs_lbn_t lbn;
1836         ufs2_daddr_t newblkno;
1837         ufs2_daddr_t oldblkno;
1838         long newsize;
1839         long oldsize;
1840         struct buf *bp;
1841 {
1842         struct allocdirect *adp, *oldadp;
1843         struct allocdirectlst *adphead;
1844         struct bmsafemap *bmsafemap;
1845         struct inodedep *inodedep;
1846         struct newblk *newblk;
1847         struct mount *mp;
1848
1849         mp = UFSTOVFS(ip->i_ump);
1850         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1851                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1852         workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
1853         adp->ad_lbn = lbn;
1854         adp->ad_newblkno = newblkno;
1855         adp->ad_oldblkno = oldblkno;
1856         adp->ad_newsize = newsize;
1857         adp->ad_oldsize = oldsize;
1858         adp->ad_state = ATTACHED | EXTDATA;
1859         LIST_INIT(&adp->ad_newdirblk);
1860         if (newblkno == oldblkno)
1861                 adp->ad_freefrag = NULL;
1862         else
1863                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1864
1865         ACQUIRE_LOCK(&lk);
1866         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1867                 panic("softdep_setup_allocext: lost block");
1868
1869         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1870         adp->ad_inodedep = inodedep;
1871
1872         if (newblk->nb_state == DEPCOMPLETE) {
1873                 adp->ad_state |= DEPCOMPLETE;
1874                 adp->ad_buf = NULL;
1875         } else {
1876                 bmsafemap = newblk->nb_bmsafemap;
1877                 adp->ad_buf = bmsafemap->sm_buf;
1878                 LIST_REMOVE(newblk, nb_deps);
1879                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1880         }
1881         LIST_REMOVE(newblk, nb_hash);
1882         FREE(newblk, M_NEWBLK);
1883
1884         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1885         if (lbn >= NXADDR)
1886                 panic("softdep_setup_allocext: lbn %lld > NXADDR",
1887                     (long long)lbn);
1888         /*
1889          * The list of allocdirects must be kept in sorted and ascending
1890          * order so that the rollback routines can quickly determine the
1891          * first uncommitted block (the size of the file stored on disk
1892          * ends at the end of the lowest committed fragment, or if there
1893          * are no fragments, at the end of the highest committed block).
1894          * Since files generally grow, the typical case is that the new
1895          * block is to be added at the end of the list. We speed this
1896          * special case by checking against the last allocdirect in the
1897          * list before laboriously traversing the list looking for the
1898          * insertion point.
1899          */
1900         adphead = &inodedep->id_newextupdt;
1901         oldadp = TAILQ_LAST(adphead, allocdirectlst);
1902         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1903                 /* insert at end of list */
1904                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1905                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1906                         allocdirect_merge(adphead, adp, oldadp);
1907                 FREE_LOCK(&lk);
1908                 return;
1909         }
1910         TAILQ_FOREACH(oldadp, adphead, ad_next) {
1911                 if (oldadp->ad_lbn >= lbn)
1912                         break;
1913         }
1914         if (oldadp == NULL)
1915                 panic("softdep_setup_allocext: lost entry");
1916         /* insert in middle of list */
1917         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1918         if (oldadp->ad_lbn == lbn)
1919                 allocdirect_merge(adphead, adp, oldadp);
1920         FREE_LOCK(&lk);
1921 }
1922
1923 /*
1924  * Indirect block allocation dependencies.
1925  *
1926  * The same dependencies that exist for a direct block also exist when
1927  * a new block is allocated and pointed to by an entry in a block of
1928  * indirect pointers. The undo/redo states described above are also
1929  * used here. Because an indirect block contains many pointers that
1930  * may have dependencies, a second copy of the entire in-memory indirect
1931  * block is kept. The buffer cache copy is always completely up-to-date.
1932  * The second copy, which is used only as a source for disk writes,
1933  * contains only the safe pointers (i.e., those that have no remaining
1934  * update dependencies). The second copy is freed when all pointers
1935  * are safe. The cache is not allowed to replace indirect blocks with
1936  * pending update dependencies. If a buffer containing an indirect
1937  * block with dependencies is written, these routines will mark it
1938  * dirty again. It can only be successfully written once all the
1939  * dependencies are removed. The ffs_fsync routine works in conjunction
1940  * with softdep_sync_metadata to get all the dependencies
1941  * removed so that a file can be successfully written to disk. Three
1942  * procedures are used when setting up indirect block pointer
1943  * dependencies. The division is necessary because of the organization
1944  * of the "balloc" routine and because of the distinction between file
1945  * pages and file metadata blocks.
1946  */
1947
1948 /*
1949  * Allocate a new allocindir structure.
1950  */
1951 static struct allocindir *
1952 newallocindir(ip, ptrno, newblkno, oldblkno)
1953         struct inode *ip;       /* inode for file being extended */
1954         int ptrno;              /* offset of pointer in indirect block */
1955         ufs2_daddr_t newblkno;  /* disk block number being added */
1956         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
1957 {
1958         struct allocindir *aip;
1959
1960         MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1961                 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1962         workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
1963         aip->ai_state = ATTACHED;
1964         aip->ai_offset = ptrno;
1965         aip->ai_newblkno = newblkno;
1966         aip->ai_oldblkno = oldblkno;
1967         aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1968         return (aip);
1969 }
1970
1971 /*
1972  * Called just before setting an indirect block pointer
1973  * to a newly allocated file page.
1974  */
1975 void
1976 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1977         struct inode *ip;       /* inode for file being extended */
1978         ufs_lbn_t lbn;          /* allocated block number within file */
1979         struct buf *bp;         /* buffer with indirect blk referencing page */
1980         int ptrno;              /* offset of pointer in indirect block */
1981         ufs2_daddr_t newblkno;  /* disk block number being added */
1982         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
1983         struct buf *nbp;        /* buffer holding allocated page */
1984 {
1985         struct allocindir *aip;
1986         struct pagedep *pagedep;
1987
1988         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
1989         aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1990         ACQUIRE_LOCK(&lk);
1991         /*
1992          * If we are allocating a directory page, then we must
1993          * allocate an associated pagedep to track additions and
1994          * deletions.
1995          */
1996         if ((ip->i_mode & IFMT) == IFDIR &&
1997             pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1998                 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1999         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
2000         setup_allocindir_phase2(bp, ip, aip);
2001         FREE_LOCK(&lk);
2002 }
2003
2004 /*
2005  * Called just before setting an indirect block pointer to a
2006  * newly allocated indirect block.
2007  */
2008 void
2009 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
2010         struct buf *nbp;        /* newly allocated indirect block */
2011         struct inode *ip;       /* inode for file being extended */
2012         struct buf *bp;         /* indirect block referencing allocated block */
2013         int ptrno;              /* offset of pointer in indirect block */
2014         ufs2_daddr_t newblkno;  /* disk block number being added */
2015 {
2016         struct allocindir *aip;
2017
2018         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
2019         aip = newallocindir(ip, ptrno, newblkno, 0);
2020         ACQUIRE_LOCK(&lk);
2021         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
2022         setup_allocindir_phase2(bp, ip, aip);
2023         FREE_LOCK(&lk);
2024 }
2025
2026 /*
2027  * Called to finish the allocation of the "aip" allocated
2028  * by one of the two routines above.
      *
      * Ties "aip" to the indirdep attached to the indirect block buffer
      * "bp", creating that indirdep (together with a saved copy of the
      * block) when the buffer does not have one yet. The newblk record for
      * aip->ai_newblkno is consumed in the process.
      *
      * Locking: the softdep lock "lk" must be held on entry (asserted).
      * It is dropped and retaken inside the loop, and every path out of
      * the loop is a break that follows an ACQUIRE_LOCK, so the routine
      * returns with "lk" held.
      *
      * Panics if "bp" has a non-negative logical block number, if the
      * newblk record cannot be found, or if an existing allocindir for the
      * same slot does not end at the block this one starts from.
2029  */
2030 static void
2031 setup_allocindir_phase2(bp, ip, aip)
2032         struct buf *bp;         /* in-memory copy of the indirect block */
2033         struct inode *ip;       /* inode for file being extended */
2034         struct allocindir *aip; /* allocindir allocated by the above routines */
2035 {
2036         struct worklist *wk;
2037         struct indirdep *indirdep, *newindirdep;
2038         struct bmsafemap *bmsafemap;
2039         struct allocindir *oldaip;
2040         struct freefrag *freefrag;
2041         struct newblk *newblk;
2042         ufs2_daddr_t blkno;
2043
2044         mtx_assert(&lk, MA_OWNED);
             /* A buffer with a non-negative lbn is rejected as not indirect. */
2045         if (bp->b_lblkno >= 0)
2046                 panic("setup_allocindir_phase2: not indir blk");
             /*
              * Each pass starts with "lk" held. A pass that finds no indirdep
              * drops the lock, allocates a candidate, and goes around again.
              */
2047         for (indirdep = NULL, newindirdep = NULL; ; ) {
                     /* Look for an indirdep already attached to the buffer. */
2048                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2049                         if (wk->wk_type != D_INDIRDEP)
2050                                 continue;
2051                         indirdep = WK_INDIRDEP(wk);
2052                         break;
2053                 }
                     /*
                      * None found, but a previous pass prepared one: install it
                      * and clear newindirdep so it is not discarded below.
                      */
2054                 if (indirdep == NULL && newindirdep) {
2055                         indirdep = newindirdep;
2056                         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
2057                         newindirdep = NULL;
2058                 }
2059                 if (indirdep) {
                             /*
                              * Take over the state recorded in the newblk. If it
                              * is not yet DEPCOMPLETE, the allocindir replaces the
                              * newblk on its bmsafemap and remembers that map's
                              * buffer. Either way the newblk is unhashed and freed.
                              */
2060                         if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
2061                             &newblk) == 0)
2062                                 panic("setup_allocindir: lost block");
2063                         if (newblk->nb_state == DEPCOMPLETE) {
2064                                 aip->ai_state |= DEPCOMPLETE;
2065                                 aip->ai_buf = NULL;
2066                         } else {
2067                                 bmsafemap = newblk->nb_bmsafemap;
2068                                 aip->ai_buf = bmsafemap->sm_buf;
2069                                 LIST_REMOVE(newblk, nb_deps);
2070                                 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
2071                                     aip, ai_deps);
2072                         }
2073                         LIST_REMOVE(newblk, nb_hash);
2074                         FREE(newblk, M_NEWBLK);
2075                         aip->ai_indirdep = indirdep;
2076                         /*
2077                          * Check to see if there is an existing dependency
2078                          * for this block. If there is, merge the old
2079                          * dependency into the new one.
                              * The search is skipped when there was no previous
                              * block, and otherwise matches on the pointer offset.
                              * LIST_FOREACH leaves oldaip NULL when nothing matches.
2080                          */
2081                         if (aip->ai_oldblkno == 0)
2082                                 oldaip = NULL;
2083                         else
2084
2085                                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
2086                                         if (oldaip->ai_offset == aip->ai_offset)
2087                                                 break;
2088                         freefrag = NULL;
2089                         if (oldaip != NULL) {
                                     /*
                                      * aip inherits the old entry's previous block and
                                      * freefrag. aip's own freefrag is set aside and
                                      * processed below once the lock is dropped. The
                                      * old entry's freefrag pointer is cleared before
                                      * the old entry is freed.
                                      */
2090                                 if (oldaip->ai_newblkno != aip->ai_oldblkno)
2091                                         panic("setup_allocindir_phase2: blkno");
2092                                 aip->ai_oldblkno = oldaip->ai_oldblkno;
2093                                 freefrag = aip->ai_freefrag;
2094                                 aip->ai_freefrag = oldaip->ai_freefrag;
2095                                 oldaip->ai_freefrag = NULL;
2096                                 free_allocindir(oldaip, NULL);
2097                         }
2098                         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
                             /*
                              * Store the previous block number in the saved copy of
                              * the indirect block, using the pointer width of the
                              * file system format.
                              */
2099                         if (ip->i_ump->um_fstype == UFS1)
2100                                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
2101                                     [aip->ai_offset] = aip->ai_oldblkno;
2102                         else
2103                                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
2104                                     [aip->ai_offset] = aip->ai_oldblkno;
2105                         FREE_LOCK(&lk);
2106                         if (freefrag != NULL)
2107                                 handle_workitem_freefrag(freefrag);
2108                 } else
2109                         FREE_LOCK(&lk);
                     /* "lk" is not held from here until it is retaken below. */
2110                 if (newindirdep) {
                             /*
                              * A candidate was prepared but the buffer turned out
                              * to have an indirdep already: discard the saved
                              * buffer and free the unused work item.
                              * NOTE(review): newindirdep is only left non-NULL
                              * above when indirdep was found, so the FREE_LOCK
                              * after the break looks unreachable - confirm.
                              */
2111                         newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2112                         brelse(newindirdep->ir_savebp);
2113                         ACQUIRE_LOCK(&lk);
2114                         WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
2115                         if (indirdep)
2116                                 break;
2117                         FREE_LOCK(&lk);
2118                 }
                     /* Done: retake the lock so the caller gets it back held. */
2119                 if (indirdep) {
2120                         ACQUIRE_LOCK(&lk);
2121                         break;
2122                 }
                     /*
                      * No indirdep exists yet. Build a candidate without holding
                      * "lk", then retake the lock and rescan the buffer.
                      */
2123                 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
2124                         M_INDIRDEP, M_SOFTDEP_FLAGS);
2125                 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
2126                     UFSTOVFS(ip->i_ump));
2127                 newindirdep->ir_state = ATTACHED;
2128                 if (ip->i_ump->um_fstype == UFS1)
2129                         newindirdep->ir_state |= UFS1FMT;
2130                 LIST_INIT(&newindirdep->ir_deplisthd);
2131                 LIST_INIT(&newindirdep->ir_donehd);
                     /*
                      * Presumably b_blkno == b_lblkno means the buffer has not
                      * been mapped to a disk address yet - TODO confirm. In that
                      * case the address is obtained from ufs_bmaparray.
                      */
2132                 if (bp->b_blkno == bp->b_lblkno) {
2133                         ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
2134                             NULL, NULL);
2135                         bp->b_blkno = blkno;
2136                 }
                     /*
                      * Get a buffer on the device vnode at the same block number
                      * and copy the current contents of the indirect block into it.
                      */
2137                 newindirdep->ir_savebp =
2138                     getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
2139                 BUF_KERNPROC(newindirdep->ir_savebp);
2140                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
2141                 ACQUIRE_LOCK(&lk);
2142         }
2143 }
2144
2145 /*
2146  * Block de-allocation dependencies.
2147  *
2148  * When blocks are de-allocated, the on-disk pointers must be nullified before
2149  * the blocks are made available for use by other files.  (The true
2150  * requirement is that old pointers must be nullified before new on-disk
2151  * pointers are set.  We chose this slightly more stringent requirement to
2152  * reduce complexity.) Our implementation handles this dependency by updating
2153  * the inode (or indirect block) appropriately but delaying the actual block
2154  * de-allocation (i.e., freemap and free space count manipulation) until
2155  * after the updated versions reach stable storage.  After the disk is
2156  * updated, the blocks can be safely de-allocated whenever it is convenient.
2157  * This implementation handles only the common case of reducing a file's
2158  * length to zero. Other cases are handled by the conventional synchronous
2159  * write approach.
2160  *
2161  * The ffs implementation with which we worked double-checks
2162  * the state of the block pointers and file size as it reduces
2163  * a file's length.  Some of this code is replicated here in our
2164  * soft updates implementation.  The freeblks->fb_chkcnt field is
2165  * used to transfer a part of this information to the procedure
2166  * that eventually de-allocates the blocks.
2167  *
2168  * This routine should be called from the routine that shortens
2169  * a file's length, before the inode's size or block pointers
2170  * are modified. It will save the block pointer information for
2171  * later release and zero the inode so that the calling routine
2172  * can release it.
      *
      * Summary of the steps below:
      *  - panic unless "length" is zero;
      *  - for IO_NORMAL, save and clear the size, direct and indirect
      *    pointers; for IO_EXT, save and clear the external attribute size
      *    and pointers; reduce i_blocks by the count being released;
      *  - copy the zeroed dinode into its disk buffer;
      *  - discard the inode's pending allocdirect dependencies and the
      *    dependencies on its dirty buffers, invalidating those buffers;
      *  - either queue the freeblks behind the inode write (delay != 0) or
      *    process it immediately (delay == 0).
2173  */
2174 void
2175 softdep_setup_freeblocks(ip, length, flags)
2176         struct inode *ip;       /* The inode whose length is to be reduced */
2177         off_t length;           /* The new length for the file */
2178         int flags;              /* IO_EXT and/or IO_NORMAL */
2179 {
2180         struct freeblks *freeblks;
2181         struct inodedep *inodedep;
2182         struct allocdirect *adp;
2183         struct bufobj *bo;
2184         struct vnode *vp;
2185         struct buf *bp;
2186         struct fs *fs;
2187         ufs2_daddr_t extblocks, datablocks;
2188         struct mount *mp;
2189         int i, delay, error;
2190
2191         fs = ip->i_fs;
2192         mp = UFSTOVFS(ip->i_ump);
2193         if (length != 0)
2194                 panic("softdep_setup_freeblocks: non-zero length");
2195         MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
2196                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
2197         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
2198         freeblks->fb_state = ATTACHED;
2199         freeblks->fb_uid = ip->i_uid;
2200         freeblks->fb_previousinum = ip->i_number;
2201         freeblks->fb_devvp = ip->i_devvp;
             /*
              * Split the inode's block count into the part held by the
              * external attribute area (computed only for UFS2) and the rest.
              */
2202         extblocks = 0;
2203         if (fs->fs_magic == FS_UFS2_MAGIC)
2204                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
2205         datablocks = DIP(ip, i_blocks) - extblocks;
2206         if ((flags & IO_NORMAL) == 0) {
2207                 freeblks->fb_oldsize = 0;
2208                 freeblks->fb_chkcnt = 0;
2209         } else {
                     /*
                      * Save the size and every direct and indirect pointer in
                      * the freeblks, clearing each one in the inode as we go.
                      */
2210                 freeblks->fb_oldsize = ip->i_size;
2211                 ip->i_size = 0;
2212                 DIP_SET(ip, i_size, 0);
2213                 freeblks->fb_chkcnt = datablocks;
2214                 for (i = 0; i < NDADDR; i++) {
2215                         freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
2216                         DIP_SET(ip, i_db[i], 0);
2217                 }
2218                 for (i = 0; i < NIADDR; i++) {
2219                         freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
2220                         DIP_SET(ip, i_ib[i], 0);
2221                 }
2222                 /*
2223                  * If the file was removed, then the space being freed was
2224                  * accounted for then (see softdep_releasefile()). If the
2225                  * file is merely being truncated, then we account for it now.
2226                  */
2227                 if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2228                         UFS_LOCK(ip->i_ump);
2229                         fs->fs_pendingblocks += datablocks;
2230                         UFS_UNLOCK(ip->i_ump);
2231                 }
2232         }
             /*
              * NOTE(review): this branch dereferences ip->i_din2 regardless of
              * the file system format - presumably callers pass IO_EXT only on
              * UFS2; confirm.
              */
2233         if ((flags & IO_EXT) == 0) {
2234                 freeblks->fb_oldextsize = 0;
2235         } else {
2236                 freeblks->fb_oldextsize = ip->i_din2->di_extsize;
2237                 ip->i_din2->di_extsize = 0;
2238                 freeblks->fb_chkcnt += extblocks;
2239                 for (i = 0; i < NXADDR; i++) {
2240                         freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
2241                         ip->i_din2->di_extb[i] = 0;
2242                 }
2243         }
             /* Remove the blocks being released from the inode's block count. */
2244         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
2245         /*
2246          * Push the zero'ed inode to its disk buffer so that we are free
2247          * to delete its dependencies below. Once the dependencies are gone
2248          * the buffer can be safely released.
              *
              * NOTE(review): when bread fails, bp is released with brelse and
              * execution continues; bp is then written through below and later
              * passed to bdwrite. Confirm whether softdep_error can return here
              * and whether that use of bp is safe.
2249          */
2250         if ((error = bread(ip->i_devvp,
2251             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
2252             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
2253                 brelse(bp);
2254                 softdep_error("softdep_setup_freeblocks", error);
2255         }
             /* Copy the in-core dinode over its slot in the inode block. */
2256         if (ip->i_ump->um_fstype == UFS1)
2257                 *((struct ufs1_dinode *)bp->b_data +
2258                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
2259         else
2260                 *((struct ufs2_dinode *)bp->b_data +
2261                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
2262         /*
2263          * Find and eliminate any inode dependencies.
2264          */
2265         ACQUIRE_LOCK(&lk);
2266         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
2267         if ((inodedep->id_state & IOSTARTED) != 0)
2268                 panic("softdep_setup_freeblocks: inode busy");
2269         /*
2270          * Add the freeblks structure to the list of operations that
2271          * must await the zero'ed inode being written to disk. If we
2272          * still have a bitmap dependency (delay == 0), then the inode
2273          * has never been written to disk, so we can process the
2274          * freeblks below once we have deleted the dependencies.
2275          */
2276         delay = (inodedep->id_state & DEPCOMPLETE);
2277         if (delay)
2278                 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
2279         /*
2280          * Because the file length has been truncated to zero, any
2281          * pending block allocation dependency structures associated
2282          * with this inode are obsolete and can simply be de-allocated.
2283          * We must first merge the two dependency lists to get rid of
2284          * any duplicate freefrag structures, then purge the merged list.
2285          * If we still have a bitmap dependency, then the inode has never
2286          * been written to disk, so we can free any fragments without delay.
2287          */
2288         if (flags & IO_NORMAL) {
2289                 merge_inode_lists(&inodedep->id_newinoupdt,
2290                     &inodedep->id_inoupdt);
2291                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2292                         free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2293         }
2294         if (flags & IO_EXT) {
2295                 merge_inode_lists(&inodedep->id_newextupdt,
2296                     &inodedep->id_extupdt);
2297                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
2298                         free_allocdirect(&inodedep->id_extupdt, adp, delay);
2299         }
2300         FREE_LOCK(&lk);
             /* Schedule the inode block, now holding the zeroed dinode. */
2301         bdwrite(bp);
2302         /*
2303          * We must wait for any I/O in progress to finish so that
2304          * all potential buffers on the dirty list will be visible.
2305          * Once they are all there, walk the list and get rid of
2306          * any dependencies.
              *
              * Buffers are skipped when they belong to the area not being
              * released: BX_ALTDATA buffers without IO_EXT, and buffers
              * without BX_ALTDATA when IO_NORMAL is not set. The buffer object
              * lock is dropped while a buffer is processed, so the scan
              * restarts from the head after each buffer, and also whenever
              * getdirtybuf returns NULL.
2307          */
2308         vp = ITOV(ip);
2309         bo = &vp->v_bufobj;
2310         BO_LOCK(bo);
2311         drain_output(vp);
2312 restart:
2313         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
2314                 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
2315                     ((flags & IO_NORMAL) == 0 &&
2316                       (bp->b_xflags & BX_ALTDATA) == 0))
2317                         continue;
2318                 if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL)
2319                         goto restart;
2320                 BO_UNLOCK(bo);
2321                 ACQUIRE_LOCK(&lk);
                     /* Look the inodedep up again; "lk" was dropped meanwhile. */
2322                 (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
2323                 deallocate_dependencies(bp, inodedep);
2324                 FREE_LOCK(&lk);
                     /* Invalidate the buffer and release it. */
2325                 bp->b_flags |= B_INVAL | B_NOCACHE;
2326                 brelse(bp);
2327                 BO_LOCK(bo);
2328                 goto restart;
2329         }
2330         BO_UNLOCK(bo);
2331         ACQUIRE_LOCK(&lk);
             /* If an inodedep still exists, try to free it. */
2332         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
2333                 (void) free_inodedep(inodedep);
2334
2335         if(delay) {
2336                 freeblks->fb_state |= DEPCOMPLETE;
2337                 /*
2338                  * If the inode with zeroed block pointers is now on disk
2339                  * we can start freeing blocks. Add freeblks to the worklist
2340                  * instead of calling  handle_workitem_freeblocks directly as
2341                  * it is more likely that additional IO is needed to complete
2342                  * the request here than in the !delay case.
2343                  */
2344                 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
2345                         add_to_worklist(&freeblks->fb_list);
2346         }
2347
2348         FREE_LOCK(&lk);
2349         /*
2350          * If the inode has never been written to disk (delay == 0),
2351          * then we can process the freeblks now that we have deleted
2352          * the dependencies.
2353          */
2354         if (!delay)
2355                 handle_workitem_freeblocks(freeblks, 0);
2356 }
2357
2358 /*
2359  * Reclaim any dependency structures from a buffer that is about to
2360  * be reallocated to a new vnode. The buffer must be locked, thus,
2361  * no I/O completion operations can occur while we are manipulating
2362  * its associated dependencies. The mutex is held so that other I/O's
2363  * associated with related dependencies do not occur.
2364  */
2365 static void
2366 deallocate_dependencies(bp, inodedep)
2367         struct buf *bp;
2368         struct inodedep *inodedep;
2369 {
2370         struct worklist *wk;
2371         struct indirdep *indirdep;
2372         struct allocindir *aip;
2373         struct pagedep *pagedep;
2374         struct dirrem *dirrem;
2375         struct diradd *dap;
2376         int i;
2377
2378         mtx_assert(&lk, MA_OWNED);
2379         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2380                 switch (wk->wk_type) {
2381
2382                 case D_INDIRDEP:
2383                         indirdep = WK_INDIRDEP(wk);
2384                         /*
2385                          * None of the indirect pointers will ever be visible,
2386                          * so they can simply be tossed. GOINGAWAY ensures
2387                          * that allocated pointers will be saved in the buffer
2388                          * cache until they are freed. Note that they will
2389                          * only be able to be found by their physical address
2390                          * since the inode mapping the logical address will
2391                          * be gone. The save buffer used for the safe copy
2392                          * was allocated in setup_allocindir_phase2 using
2393                          * the physical address so it could be used for this
2394                          * purpose. Hence we swap the safe copy with the real
2395                          * copy, allowing the safe copy to be freed and holding
2396                          * on to the real copy for later use in indir_trunc.
2397                          */
2398                         if (indirdep->ir_state & GOINGAWAY)
2399                                 panic("deallocate_dependencies: already gone");
2400                         indirdep->ir_state |= GOINGAWAY;
2401                         VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
2402                         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2403                                 free_allocindir(aip, inodedep);
2404                         if (bp->b_lblkno >= 0 ||
2405                             bp->b_blkno != indirdep->ir_savebp->b_lblkno)
2406                                 panic("deallocate_dependencies: not indir");
2407                         bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2408                             bp->b_bcount);
2409                         WORKLIST_REMOVE(wk);
2410                         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2411                         continue;
2412
2413                 case D_PAGEDEP:
2414                         pagedep = WK_PAGEDEP(wk);
2415                         /*
2416                          * None of the directory additions will ever be
2417                          * visible, so they can simply be tossed.
2418                          */
2419                         for (i = 0; i < DAHASHSZ; i++)
2420                                 while ((dap =
2421                                     LIST_FIRST(&pagedep->pd_diraddhd[i])))
2422                                         free_diradd(dap);
2423                         while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2424                                 free_diradd(dap);
2425                         /*
2426                          * Copy any directory remove dependencies to the list
2427                          * to be processed after the zero'ed inode is written.
2428                          * If the inode has already been written, then they
2429                          * can be dumped directly onto the work list.
2430                          */
2431                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
2432                                 LIST_REMOVE(dirrem, dm_next);
2433                                 dirrem->dm_dirinum = pagedep->pd_ino;
2434                                 if (inodedep == NULL ||
2435                                     (inodedep->id_state & ALLCOMPLETE) ==
2436                                      ALLCOMPLETE)
2437                                         add_to_worklist(&dirrem->dm_list);
2438                                 else
2439                                         WORKLIST_INSERT(&inodedep->id_bufwait,
2440                                             &dirrem->dm_list);
2441                         }
2442                         if ((pagedep->pd_state & NEWBLOCK) != 0) {
2443                                 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2444                                         if (wk->wk_type == D_NEWDIRBLK &&
2445                                             WK_NEWDIRBLK(wk)->db_pagedep ==
2446                                               pagedep)
2447                                                 break;
2448                                 if (wk != NULL) {
2449                                         WORKLIST_REMOVE(wk);
2450                                         free_newdirblk(WK_NEWDIRBLK(wk));
2451                                 } else
2452                                         panic("deallocate_dependencies: "
2453                                               "lost pagedep");
2454                         }
2455                         WORKLIST_REMOVE(&pagedep->pd_list);
2456                         LIST_REMOVE(pagedep, pd_hash);
2457                         WORKITEM_FREE(pagedep, D_PAGEDEP);
2458                         continue;
2459
2460                 case D_ALLOCINDIR:
2461                         free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2462                         continue;
2463
2464                 case D_ALLOCDIRECT:
2465                 case D_INODEDEP:
2466                         panic("deallocate_dependencies: Unexpected type %s",
2467                             TYPENAME(wk->wk_type));
2468                         /* NOTREACHED */
2469
2470                 default:
2471                         panic("deallocate_dependencies: Unknown type %s",
2472                             TYPENAME(wk->wk_type));
2473                         /* NOTREACHED */
2474                 }
2475         }
2476 }
2477
2478 /*
2479  * Free an allocdirect. Generate a new freefrag work request if appropriate.
2480  * This routine must be called with splbio interrupts blocked.
2481  */
2482 static void
2483 free_allocdirect(adphead, adp, delay)
2484         struct allocdirectlst *adphead;
2485         struct allocdirect *adp;
2486         int delay;
2487 {
2488         struct newdirblk *newdirblk;
2489         struct worklist *wk;
2490
2491         mtx_assert(&lk, MA_OWNED);
2492         if ((adp->ad_state & DEPCOMPLETE) == 0)
2493                 LIST_REMOVE(adp, ad_deps);
2494         TAILQ_REMOVE(adphead, adp, ad_next);
2495         if ((adp->ad_state & COMPLETE) == 0)
2496                 WORKLIST_REMOVE(&adp->ad_list);
2497         if (adp->ad_freefrag != NULL) {
2498                 if (delay)
2499                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2500                             &adp->ad_freefrag->ff_list);
2501                 else
2502                         add_to_worklist(&adp->ad_freefrag->ff_list);
2503         }
2504         if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2505                 newdirblk = WK_NEWDIRBLK(wk);
2506                 WORKLIST_REMOVE(&newdirblk->db_list);
2507                 if (!LIST_EMPTY(&adp->ad_newdirblk))
2508                         panic("free_allocdirect: extra newdirblk");
2509                 if (delay)
2510                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2511                             &newdirblk->db_list);
2512                 else
2513                         free_newdirblk(newdirblk);
2514         }
2515         WORKITEM_FREE(adp, D_ALLOCDIRECT);
2516 }
2517
2518 /*
2519  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2520  * This routine must be called with splbio interrupts blocked.
2521  */
2522 static void
2523 free_newdirblk(newdirblk)
2524         struct newdirblk *newdirblk;
2525 {
2526         struct pagedep *pagedep;
2527         struct diradd *dap;
2528         int i;
2529
2530         mtx_assert(&lk, MA_OWNED);
2531         /*
2532          * If the pagedep is still linked onto the directory buffer
2533          * dependency chain, then some of the entries on the
2534          * pd_pendinghd list may not be committed to disk yet. In
2535          * this case, we will simply clear the NEWBLOCK flag and
2536          * let the pd_pendinghd list be processed when the pagedep
2537          * is next written. If the pagedep is no longer on the buffer
2538          * dependency chain, then all the entries on the pd_pending
2539          * list are committed to disk and we can free them here.
2540          */
2541         pagedep = newdirblk->db_pagedep;
2542         pagedep->pd_state &= ~NEWBLOCK;
2543         if ((pagedep->pd_state & ONWORKLIST) == 0)
2544                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2545                         free_diradd(dap);
2546         /*
2547          * If no dependencies remain, the pagedep will be freed.
2548          */
2549         for (i = 0; i < DAHASHSZ; i++)
2550                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
2551                         break;
2552         if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2553                 LIST_REMOVE(pagedep, pd_hash);
2554                 WORKITEM_FREE(pagedep, D_PAGEDEP);
2555         }
2556         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2557 }
2558
2559 /*
2560  * Prepare an inode to be freed. The actual free operation is not
2561  * done until the zero'ed inode has been written to disk.
2562  */
2563 void
2564 softdep_freefile(pvp, ino, mode)
2565         struct vnode *pvp;
2566         ino_t ino;
2567         int mode;
2568 {
2569         struct inode *ip = VTOI(pvp);
2570         struct inodedep *inodedep;
2571         struct freefile *freefile;
2572
2573         /*
2574          * This sets up the inode de-allocation dependency.
2575          */
2576         MALLOC(freefile, struct freefile *, sizeof(struct freefile),
2577                 M_FREEFILE, M_SOFTDEP_FLAGS);
2578         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
2579         freefile->fx_mode = mode;
2580         freefile->fx_oldinum = ino;
2581         freefile->fx_devvp = ip->i_devvp;
2582         if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2583                 UFS_LOCK(ip->i_ump);
2584                 ip->i_fs->fs_pendinginodes += 1;
2585                 UFS_UNLOCK(ip->i_ump);
2586         }
2587
2588         /*
2589          * If the inodedep does not exist, then the zero'ed inode has
2590          * been written to disk. If the allocated inode has never been
2591          * written to disk, then the on-disk inode is zero'ed. In either
2592          * case we can free the file immediately.
2593          */
2594         ACQUIRE_LOCK(&lk);
2595         if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
2596             check_inode_unwritten(inodedep)) {
2597                 FREE_LOCK(&lk);
2598                 handle_workitem_freefile(freefile);
2599                 return;
2600         }
2601         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2602         FREE_LOCK(&lk);
2603         ip->i_flag |= IN_MODIFIED;
2604 }
2605
2606 /*
2607  * Check to see if an inode has never been written to disk. If
2608  * so free the inodedep and return success, otherwise return failure.
2609  * This routine must be called with splbio interrupts blocked.
2610  *
2611  * If we still have a bitmap dependency, then the inode has never
2612  * been written to disk. Drop the dependency as it is no longer
2613  * necessary since the inode is being deallocated. We set the
2614  * ALLCOMPLETE flags since the bitmap now properly shows that the
2615  * inode is not allocated. Even if the inode is actively being
2616  * written, it has been rolled back to its zero'ed state, so we
2617  * are ensured that a zero inode is what is on the disk. For short
2618  * lived files, this change will usually result in removing all the
2619  * dependencies from the inode so that it can be freed immediately.
2620  */
2621 static int
2622 check_inode_unwritten(inodedep)
2623         struct inodedep *inodedep;
2624 {
2625
2626         mtx_assert(&lk, MA_OWNED);
2627         if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2628             !LIST_EMPTY(&inodedep->id_pendinghd) ||
2629             !LIST_EMPTY(&inodedep->id_bufwait) ||
2630             !LIST_EMPTY(&inodedep->id_inowait) ||
2631             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
2632             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
2633             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
2634             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
2635             inodedep->id_nlinkdelta != 0)
2636                 return (0);
2637
2638         /*
2639          * Another process might be in initiate_write_inodeblock_ufs[12]
2640          * trying to allocate memory without holding "Softdep Lock".
2641          */
2642         if ((inodedep->id_state & IOSTARTED) != 0 &&
2643             inodedep->id_savedino1 == NULL)
2644                 return (0);
2645
2646         inodedep->id_state |= ALLCOMPLETE;
2647         LIST_REMOVE(inodedep, id_deps);
2648         inodedep->id_buf = NULL;
2649         if (inodedep->id_state & ONWORKLIST)
2650                 WORKLIST_REMOVE(&inodedep->id_list);
2651         if (inodedep->id_savedino1 != NULL) {
2652                 FREE(inodedep->id_savedino1, M_SAVEDINO);
2653                 inodedep->id_savedino1 = NULL;
2654         }
2655         if (free_inodedep(inodedep) == 0)
2656                 panic("check_inode_unwritten: busy inode");
2657         return (1);
2658 }
2659
2660 /*
2661  * Try to free an inodedep structure. Return 1 if it could be freed.
2662  */
2663 static int
2664 free_inodedep(inodedep)
2665         struct inodedep *inodedep;
2666 {
2667
2668         mtx_assert(&lk, MA_OWNED);
2669         if ((inodedep->id_state & ONWORKLIST) != 0 ||
2670             (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2671             !LIST_EMPTY(&inodedep->id_pendinghd) ||
2672             !LIST_EMPTY(&inodedep->id_bufwait) ||
2673             !LIST_EMPTY(&inodedep->id_inowait) ||
2674             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
2675             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
2676             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
2677             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
2678             inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2679                 return (0);
2680         LIST_REMOVE(inodedep, id_hash);
2681         WORKITEM_FREE(inodedep, D_INODEDEP);
2682         num_inodedep -= 1;
2683         return (1);
2684 }
2685
2686 /*
2687  * This workitem routine performs the block de-allocation.
2688  * The workitem is added to the pending list after the updated
2689  * inode block has been written to disk.  As mentioned above,
2690  * checks regarding the number of blocks de-allocated (compared
2691  * to the number of blocks allocated for the file) are also
2692  * performed in this function.
2693  */
2694 static void
2695 handle_workitem_freeblocks(freeblks, flags)
2696         struct freeblks *freeblks;
2697         int flags;
2698 {
2699         struct inode *ip;
2700         struct vnode *vp;
2701         struct fs *fs;
2702         struct ufsmount *ump;
2703         int i, nblocks, level, bsize;
2704         ufs2_daddr_t bn, blocksreleased = 0;
2705         int error, allerror = 0;
2706         ufs_lbn_t baselbns[NIADDR], tmpval;
2707         int fs_pendingblocks;
2708
2709         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
2710         fs = ump->um_fs;
2711         fs_pendingblocks = 0;
2712         tmpval = 1;
2713         baselbns[0] = NDADDR;
2714         for (i = 1; i < NIADDR; i++) {
2715                 tmpval *= NINDIR(fs);
2716                 baselbns[i] = baselbns[i - 1] + tmpval;
2717         }
2718         nblocks = btodb(fs->fs_bsize);
2719         blocksreleased = 0;
2720         /*
2721          * Release all extended attribute blocks or frags.
2722          */
2723         if (freeblks->fb_oldextsize > 0) {
2724                 for (i = (NXADDR - 1); i >= 0; i--) {
2725                         if ((bn = freeblks->fb_eblks[i]) == 0)
2726                                 continue;
2727                         bsize = sblksize(fs, freeblks->fb_oldextsize, i);
2728                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2729                             freeblks->fb_previousinum);
2730                         blocksreleased += btodb(bsize);
2731                 }
2732         }
2733         /*
2734          * Release all data blocks or frags.
2735          */
2736         if (freeblks->fb_oldsize > 0) {
2737                 /*
2738                  * Indirect blocks first.
2739                  */
2740                 for (level = (NIADDR - 1); level >= 0; level--) {
2741                         if ((bn = freeblks->fb_iblks[level]) == 0)
2742                                 continue;
2743                         if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
2744                             level, baselbns[level], &blocksreleased)) != 0)
2745                                 allerror = error;
2746                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
2747                             fs->fs_bsize, freeblks->fb_previousinum);
2748                         fs_pendingblocks += nblocks;
2749                         blocksreleased += nblocks;
2750                 }
2751                 /*
2752                  * All direct blocks or frags.
2753                  */
2754                 for (i = (NDADDR - 1); i >= 0; i--) {
2755                         if ((bn = freeblks->fb_dblks[i]) == 0)
2756                                 continue;
2757                         bsize = sblksize(fs, freeblks->fb_oldsize, i);
2758                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2759                             freeblks->fb_previousinum);
2760                         fs_pendingblocks += btodb(bsize);
2761                         blocksreleased += btodb(bsize);
2762                 }
2763         }
2764         UFS_LOCK(ump);
2765         fs->fs_pendingblocks -= fs_pendingblocks;
2766         UFS_UNLOCK(ump);
2767         /*
2768          * If we still have not finished background cleanup, then check
2769          * to see if the block count needs to be adjusted.
2770          */
2771         if (freeblks->fb_chkcnt != blocksreleased &&
2772             (fs->fs_flags & FS_UNCLEAN) != 0 &&
2773             ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
2774             (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
2775                 ip = VTOI(vp);
2776                 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
2777                     freeblks->fb_chkcnt - blocksreleased);
2778                 ip->i_flag |= IN_CHANGE;
2779                 vput(vp);
2780         }
2781
2782 #ifdef INVARIANTS
2783         if (freeblks->fb_chkcnt != blocksreleased &&
2784             ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2785                 printf("handle_workitem_freeblocks: block count\n");
2786         if (allerror)
2787                 softdep_error("handle_workitem_freeblks", allerror);
2788 #endif /* INVARIANTS */
2789
2790         ACQUIRE_LOCK(&lk);
2791         WORKITEM_FREE(freeblks, D_FREEBLKS);
2792         FREE_LOCK(&lk);
2793 }
2794
2795 /*
2796  * Release blocks associated with the inode ip and stored in the indirect
2797  * block dbn. If level is greater than SINGLE, the block is an indirect block
2798  * and recursive calls to indirtrunc must be used to cleanse other indirect
2799  * blocks.
2800  */
2801 static int
2802 indir_trunc(freeblks, dbn, level, lbn, countp)
2803         struct freeblks *freeblks;
2804         ufs2_daddr_t dbn;
2805         int level;
2806         ufs_lbn_t lbn;
2807         ufs2_daddr_t *countp;
2808 {
2809         struct buf *bp;
2810         struct fs *fs;
2811         struct worklist *wk;
2812         struct indirdep *indirdep;
2813         struct ufsmount *ump;
2814         ufs1_daddr_t *bap1 = 0;
2815         ufs2_daddr_t nb, *bap2 = 0;
2816         ufs_lbn_t lbnadd;
2817         int i, nblocks, ufs1fmt;
2818         int error, allerror = 0;
2819         int fs_pendingblocks;
2820
2821         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
2822         fs = ump->um_fs;
2823         fs_pendingblocks = 0;
2824         lbnadd = 1;
2825         for (i = level; i > 0; i--)
2826                 lbnadd *= NINDIR(fs);
2827         /*
2828          * Get buffer of block pointers to be freed. This routine is not
2829          * called until the zero'ed inode has been written, so it is safe
2830          * to free blocks as they are encountered. Because the inode has
2831          * been zero'ed, calls to bmap on these blocks will fail. So, we
2832          * have to use the on-disk address and the block device for the
2833          * filesystem to look them up. If the file was deleted before its
2834          * indirect blocks were all written to disk, the routine that set
2835          * us up (deallocate_dependencies) will have arranged to leave
2836          * a complete copy of the indirect block in memory for our use.
2837          * Otherwise we have to read the blocks in from the disk.
2838          */
2839 #ifdef notyet
2840         bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
2841             GB_NOCREAT);
2842 #else
2843         bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
2844 #endif
2845         ACQUIRE_LOCK(&lk);
2846         if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2847                 if (wk->wk_type != D_INDIRDEP ||
2848                     (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2849                     (indirdep->ir_state & GOINGAWAY) == 0)
2850                         panic("indir_trunc: lost indirdep");
2851                 WORKLIST_REMOVE(wk);
2852                 WORKITEM_FREE(indirdep, D_INDIRDEP);
2853                 if (!LIST_EMPTY(&bp->b_dep))
2854                         panic("indir_trunc: dangling dep");
2855                 ump->um_numindirdeps -= 1;
2856                 FREE_LOCK(&lk);
2857         } else {
2858 #ifdef notyet
2859                 if (bp)
2860                         brelse(bp);
2861 #endif
2862                 FREE_LOCK(&lk);
2863                 error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
2864                     NOCRED, &bp);
2865                 if (error) {
2866                         brelse(bp);
2867                         return (error);
2868                 }
2869         }
2870         /*
2871          * Recursively free indirect blocks.
2872          */
2873         if (ump->um_fstype == UFS1) {
2874                 ufs1fmt = 1;
2875                 bap1 = (ufs1_daddr_t *)bp->b_data;
2876         } else {
2877                 ufs1fmt = 0;
2878                 bap2 = (ufs2_daddr_t *)bp->b_data;
2879         }
2880         nblocks = btodb(fs->fs_bsize);
2881         for (i = NINDIR(fs) - 1; i >= 0; i--) {
2882                 if (ufs1fmt)
2883                         nb = bap1[i];
2884                 else
2885                         nb = bap2[i];
2886                 if (nb == 0)
2887                         continue;
2888                 if (level != 0) {
2889                         if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
2890                              level - 1, lbn + (i * lbnadd), countp)) != 0)
2891                                 allerror = error;
2892                 }
2893                 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
2894                     freeblks->fb_previousinum);
2895                 fs_pendingblocks += nblocks;
2896                 *countp += nblocks;
2897         }
2898         UFS_LOCK(ump);
2899         fs->fs_pendingblocks -= fs_pendingblocks;
2900         UFS_UNLOCK(ump);
2901         bp->b_flags |= B_INVAL | B_NOCACHE;
2902         brelse(bp);
2903         return (allerror);
2904 }
2905
2906 /*
2907  * Free an allocindir.
2908  * This routine must be called with splbio interrupts blocked.
2909  */
2910 static void
2911 free_allocindir(aip, inodedep)
2912         struct allocindir *aip;
2913         struct inodedep *inodedep;
2914 {
2915         struct freefrag *freefrag;
2916
2917         mtx_assert(&lk, MA_OWNED);
2918         if ((aip->ai_state & DEPCOMPLETE) == 0)
2919                 LIST_REMOVE(aip, ai_deps);
2920         if (aip->ai_state & ONWORKLIST)
2921                 WORKLIST_REMOVE(&aip->ai_list);
2922         LIST_REMOVE(aip, ai_next);
2923         if ((freefrag = aip->ai_freefrag) != NULL) {
2924                 if (inodedep == NULL)
2925                         add_to_worklist(&freefrag->ff_list);
2926                 else
2927                         WORKLIST_INSERT(&inodedep->id_bufwait,
2928                             &freefrag->ff_list);
2929         }
2930         WORKITEM_FREE(aip, D_ALLOCINDIR);
2931 }
2932
2933 /*
2934  * Directory entry addition dependencies.
2935  *
2936  * When adding a new directory entry, the inode (with its incremented link
2937  * count) must be written to disk before the directory entry's pointer to it.
2938  * Also, if the inode is newly allocated, the corresponding freemap must be
2939  * updated (on disk) before the directory entry's pointer. These requirements
2940  * are met via undo/redo on the directory entry's pointer, which consists
2941  * simply of the inode number.
2942  *
2943  * As directory entries are added and deleted, the free space within a
2944  * directory block can become fragmented.  The ufs filesystem will compact
2945  * a fragmented directory block to make space for a new entry. When this
2946  * occurs, the offsets of previously added entries change. Any "diradd"
2947  * dependency structures corresponding to these entries must be updated with
2948  * the new offsets.
2949  */
2950
2951 /*
2952  * This routine is called after the in-memory inode's link
2953  * count has been incremented, but before the directory entry's
2954  * pointer to the inode has been set.
2955  */
2956 int
2957 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2958         struct buf *bp;         /* buffer containing directory block */
2959         struct inode *dp;       /* inode for directory */
2960         off_t diroffset;        /* offset of new entry in directory */
2961         ino_t newinum;          /* inode referenced by new directory entry */
2962         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
2963         int isnewblk;           /* entry is in a newly allocated block */
2964 {
2965         int offset;             /* offset of new entry within directory block */
2966         ufs_lbn_t lbn;          /* block in directory containing new entry */
2967         struct fs *fs;
2968         struct diradd *dap;
2969         struct allocdirect *adp;
2970         struct pagedep *pagedep;
2971         struct inodedep *inodedep;
2972         struct newdirblk *newdirblk = 0;
2973         struct mkdir *mkdir1, *mkdir2;
2974         struct mount *mp;
2975
2976         /*
2977          * Whiteouts have no dependencies.
2978          */
2979         if (newinum == WINO) {
2980                 if (newdirbp != NULL)
2981                         bdwrite(newdirbp);
2982                 return (0);
2983         }
2984         mp = UFSTOVFS(dp->i_ump);
2985         fs = dp->i_fs;
2986         lbn = lblkno(fs, diroffset);
2987         offset = blkoff(fs, diroffset);
2988         MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2989                 M_SOFTDEP_FLAGS|M_ZERO);
2990         workitem_alloc(&dap->da_list, D_DIRADD, mp);
2991         dap->da_offset = offset;
2992         dap->da_newinum = newinum;
2993         dap->da_state = ATTACHED;
2994         if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2995                 MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
2996                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
2997                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
2998         }
2999         if (newdirbp == NULL) {
3000                 dap->da_state |= DEPCOMPLETE;
3001                 ACQUIRE_LOCK(&lk);
3002         } else {
3003                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
3004                 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
3005                     M_SOFTDEP_FLAGS);
3006                 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
3007                 mkdir1->md_state = MKDIR_BODY;
3008                 mkdir1->md_diradd = dap;
3009                 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
3010                     M_SOFTDEP_FLAGS);
3011                 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
3012                 mkdir2->md_state = MKDIR_PARENT;
3013                 mkdir2->md_diradd = dap;
3014                 /*
3015                  * Dependency on "." and ".." being written to disk.
3016                  */
3017                 mkdir1->md_buf = newdirbp;
3018                 ACQUIRE_LOCK(&lk);
3019                 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
3020                 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
3021                 FREE_LOCK(&lk);
3022                 bdwrite(newdirbp);
3023                 /*
3024                  * Dependency on link count increase for parent directory
3025                  */
3026                 ACQUIRE_LOCK(&lk);
3027                 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
3028                     || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3029                         dap->da_state &= ~MKDIR_PARENT;
3030                         WORKITEM_FREE(mkdir2, D_MKDIR);
3031                 } else {
3032                         LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
3033                         WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
3034                 }
3035         }
3036         /*
3037          * Link into parent directory pagedep to await its being written.
3038          */
3039         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3040                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3041         dap->da_pagedep = pagedep;
3042         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
3043             da_pdlist);
3044         /*
3045          * Link into its inodedep. Put it on the id_bufwait list if the inode
3046          * is not yet written. If it is written, do the post-inode write
3047          * processing to put it on the id_pendinghd list.
3048          */
3049         (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
3050         if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
3051                 diradd_inode_written(dap, inodedep);
3052         else
3053                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3054         if (isnewblk) {
3055                 /*
3056                  * Directories growing into indirect blocks are rare
3057                  * enough and the frequency of new block allocation
3058                  * in those cases even more rare, that we choose not
3059                  * to bother tracking them. Rather we simply force the
3060                  * new directory entry to disk.
3061                  */
3062                 if (lbn >= NDADDR) {
3063                         FREE_LOCK(&lk);
3064                         /*
3065                          * We only have a new allocation when at the
3066                          * beginning of a new block, not when we are
3067                          * expanding into an existing block.
3068                          */
3069                         if (blkoff(fs, diroffset) == 0)
3070                                 return (1);
3071                         return (0);
3072                 }
3073                 /*
3074                  * We only have a new allocation when at the beginning
3075                  * of a new fragment, not when we are expanding into an
3076                  * existing fragment. Also, there is nothing to do if we
3077                  * are already tracking this block.
3078                  */
3079                 if (fragoff(fs, diroffset) != 0) {
3080                         FREE_LOCK(&lk);
3081                         return (0);
3082                 }
3083                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
3084                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
3085                         FREE_LOCK(&lk);
3086                         return (0);
3087                 }
3088                 /*
3089                  * Find our associated allocdirect and have it track us.
3090                  */
3091                 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
3092                         panic("softdep_setup_directory_add: lost inodedep");
3093                 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
3094                 if (adp == NULL || adp->ad_lbn != lbn)
3095                         panic("softdep_setup_directory_add: lost entry");
3096                 pagedep->pd_state |= NEWBLOCK;
3097                 newdirblk->db_pagedep = pagedep;
3098                 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
3099         }
3100         FREE_LOCK(&lk);
3101         return (0);
3102 }
3103
3104 /*
3105  * Record that a directory entry has moved within its block while
3106  * that block is being compacted; the caller must own the block
3107  * exclusively. The entry itself is copied here, with lk held, so
3108  * that no I/O completion can run while the recorded offset and
3109  * the entry contents disagree.
3110  */
3111 void
3112 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
3113         struct inode *dp;       /* inode for directory */
3114         caddr_t base;           /* address of dp->i_offset */
3115         caddr_t oldloc;         /* address of old directory location */
3116         caddr_t newloc;         /* address of new directory location */
3117         int entrysize;          /* size of directory entry */
3118 {
3119         struct diradd *dap;
3120         struct pagedep *pagedep;
3121         int blkstart, from, to;
3122         ufs_lbn_t lbn;
3123
3124         ACQUIRE_LOCK(&lk);
3125         lbn = lblkno(dp->i_fs, dp->i_offset);
3126         blkstart = blkoff(dp->i_fs, dp->i_offset);
3127         if (pagedep_lookup(dp, lbn, 0, &pagedep) != 0) {
3128                 from = blkstart + (oldloc - base);
3129                 to = blkstart + (newloc - base);
3130                 /* First try the hash chain selected by the old offset. */
3131                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(from)],
3132                     da_pdlist)
3133                         if (dap->da_offset == from)
3134                                 break;
3135                 if (dap != NULL) {
3136                         dap->da_offset = to;
3137                         /* Move it only if the new offset hashes elsewhere. */
3138                         if (DIRADDHASH(to) != DIRADDHASH(from)) {
3139                                 LIST_REMOVE(dap, da_pdlist);
3140                                 LIST_INSERT_HEAD(
3141                                     &pagedep->pd_diraddhd[DIRADDHASH(to)],
3142                                     dap, da_pdlist);
3143                         }
3144                 } else {
3145                         /* Not hashed; fall back to the pending list. */
3146                         LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
3147                                 if (dap->da_offset == from) {
3148                                         dap->da_offset = to;
3149                                         break;
3150                                 }
3151                 }
3152         }
3153         bcopy(oldloc, newloc, entrysize);
3154         FREE_LOCK(&lk);
3155 }
3156
3157 /*
3158  * Free a diradd dependency structure, first detaching it from the
3159  * lists it is linked on. The caller must hold lk (asserted below).
3160  */
3161 static void
3162 free_diradd(dap)
3163         struct diradd *dap;
3164 {
3165         struct dirrem *dirrem;
3166         struct pagedep *pagedep;
3167         struct inodedep *inodedep;
3168         struct mkdir *mkdir, *nextmd;
3169
3170         mtx_assert(&lk, MA_OWNED);
3171         WORKLIST_REMOVE(&dap->da_list);
3172         LIST_REMOVE(dap, da_pdlist);
             /*
              * A diradd without DIRCHG names its pagedep directly. A DIRCHG
              * diradd instead carries a dirrem in da_previous; the pagedep
              * is reached through that dirrem, which is then queued with
              * add_to_worklist().
              */
3173         if ((dap->da_state & DIRCHG) == 0) {
3174                 pagedep = dap->da_pagedep;
3175         } else {
3176                 dirrem = dap->da_previous;
3177                 pagedep = dirrem->dm_pagedep;
3178                 dirrem->dm_dirinum = pagedep->pd_ino;
3179                 add_to_worklist(&dirrem->dm_list);
3180         }
             /*
              * If an inodedep exists for the new inode, offer it to
              * free_inodedep(); the result is ignored here.
              * NOTE(review): presumably free_inodedep() declines when other
              * dependencies remain -- confirm against its definition.
              */
3181         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
3182             0, &inodedep) != 0)
3183                 (void) free_inodedep(inodedep);
             /*
              * Cancel any mkdir work items that still point at this diradd.
              * nextmd is fetched before the body runs because the current
              * mkdir may be freed in this iteration.
              */
3184         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
3185                 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
3186                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
3187                         if (mkdir->md_diradd != dap)
3188                                 continue;
3189                         dap->da_state &= ~mkdir->md_state;
3190                         WORKLIST_REMOVE(&mkdir->md_list);
3191                         LIST_REMOVE(mkdir, md_mkdirs);
3192                         WORKITEM_FREE(mkdir, D_MKDIR);
3193                 }
                     /* Each flag set on entry must have matched a mkdir. */
3194                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
3195                         panic("free_diradd: unfound ref");
3196         }
3197         WORKITEM_FREE(dap, D_DIRADD);
3198 }
3199
3200 /*
3201  * Directory entry removal dependencies.
3202  *
3203  * When removing a directory entry, the entry's inode pointer must be
3204  * zero'ed on disk before the corresponding inode's link count is decremented
3205  * (possibly freeing the inode for re-use). This dependency is handled by
3206  * updating the directory entry but delaying the inode count reduction until
3207  * after the directory block has been written to disk. After this point, the
3208  * inode count can be decremented whenever it is convenient.
3209  */
3210
3211 /*
3212  * Set up the dependency for a directory entry that has just been
3213  * removed; call this immediately after the removal.  The caller
3214  * must not decrement the inode's link count itself -- the soft
3215  * updates code does that when it is safe.
3216  */
3217 void
3218 softdep_setup_remove(bp, dp, ip, isrmdir)
3219         struct buf *bp;         /* buffer containing directory block */
3220         struct inode *dp;       /* inode for the directory being modified */
3221         struct inode *ip;       /* inode for directory entry being removed */
3222         int isrmdir;            /* indicates if doing RMDIR */
3223 {
3224         struct dirrem *rem, *prevrem;
3225         struct pagedep *pagedep;
3226
3227         /* newdirrem() allocates the dirrem and returns with lk held. */
3228         rem = newdirrem(bp, dp, ip, isrmdir, &prevrem);
3229         pagedep = rem->dm_pagedep;
3230
3231         /*
3232          * COMPLETE clear: no diradd was cancelled for this entry. Queue
3233          * the dirrem on the pagedep's pd_dirremhd list; nothing more is
3234          * done here.
3235          */
3236         if ((rem->dm_state & COMPLETE) == 0) {
3237                 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, rem, dm_next);
3238                 FREE_LOCK(&lk);
3239                 return;
3240         }
3241
3242         /*
3243          * COMPLETE set: the removed entry never made it to disk. If it
3244          * came from a name change, prevrem describes the inode that the
3245          * on-disk entry still references; that dirrem has to wait on the
3246          * pagedep. The inode removed here was never referenced on disk,
3247          * so its dirrem is processed immediately.
3248          */
3249         if (prevrem != NULL)
3250                 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, prevrem, dm_next);
3251         rem->dm_dirinum = pagedep->pd_ino;
3252         FREE_LOCK(&lk);
3253         handle_workitem_remove(rem, NULL);
3254 }
3255
3256 /*
3257  * Allocate a new dirrem if appropriate and return it along with
3258  * its associated pagedep. Called without a lock, returns with lock.
      *
      * The pagedep is recorded in the returned dirrem's dm_pagedep field.
      * COMPLETE is set in dm_state when a diradd for the same entry was
      * found and freed here, i.e. the entry being removed never made it
      * to disk. In that case, if the diradd was a name change (DIRCHG),
      * *prevdirremp is set to the dirrem it carried; in every other case
      * *prevdirremp is NULL.
3259  */
3260 static long num_dirrem;         /* number of dirrem allocated */
3261 static struct dirrem *
3262 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
3263         struct buf *bp;         /* buffer containing directory block */
3264         struct inode *dp;       /* inode for the directory being modified */
3265         struct inode *ip;       /* inode for directory entry being removed */
3266         int isrmdir;            /* indicates if doing RMDIR */
3267         struct dirrem **prevdirremp; /* previously referenced inode, if any */
3268 {
3269         int offset;
3270         ufs_lbn_t lbn;
3271         struct diradd *dap;
3272         struct dirrem *dirrem;
3273         struct pagedep *pagedep;
3274
3275         /*
3276          * Whiteouts have no deletion dependencies, so being called
              * without an inode is treated as a caller error.
3277          */
3278         if (ip == NULL)
3279                 panic("newdirrem: whiteout");
3280         /*
3281          * If we are over our limit, try to improve the situation.
3282          * Limiting the number of dirrem structures will also limit
3283          * the number of freefile and freeblks structures.
3284          */
3285         ACQUIRE_LOCK(&lk);
3286         if (num_dirrem > max_softdeps / 2)
3287                 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
3288         num_dirrem += 1;
3289         FREE_LOCK(&lk);
             /*
              * lk is released across the allocation and retaken afterwards.
              * NOTE(review): presumably because MALLOC with M_SOFTDEP_FLAGS
              * may sleep -- confirm the flag definition.
              */
3290         MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
3291                 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
3292         workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
3293         dirrem->dm_state = isrmdir ? RMDIR : 0;
3294         dirrem->dm_oldinum = ip->i_number;
3295         *prevdirremp = NULL;
3296
3297         ACQUIRE_LOCK(&lk);
3298         lbn = lblkno(dp->i_fs, dp->i_offset);
3299         offset = blkoff(dp->i_fs, dp->i_offset);
             /* A zero return is taken to mean the pagedep must be hooked to bp. */
3300         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3301                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3302         dirrem->dm_pagedep = pagedep;
3303         /*
3304          * Check for a diradd dependency for the same directory entry.
3305          * If present, then both dependencies become obsolete and can
3306          * be de-allocated. Check for an entry on both the pd_diraddhd
3307          * list and the pd_pendinghd list.
3308          */
3309
3310         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
3311                 if (dap->da_offset == offset)
3312                         break;
3313         if (dap == NULL) {
3314
3315                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
3316                         if (dap->da_offset == offset)
3317                                 break;
                     /* No diradd for this entry: COMPLETE stays clear. */
3318                 if (dap == NULL)
3319                         return (dirrem);
3320         }
3321         /*
3322          * Must be ATTACHED at this point.
3323          */
3324         if ((dap->da_state & ATTACHED) == 0)
3325                 panic("newdirrem: not ATTACHED");
3326         if (dap->da_newinum != ip->i_number)
3327                 panic("newdirrem: inum %d should be %d",
3328                     ip->i_number, dap->da_newinum);
3329         /*
3330          * If we are deleting a changed name that never made it to disk,
3331          * then return the dirrem describing the previous inode (which
3332          * represents the inode currently referenced from this entry on disk).
              * DIRCHG is cleared and da_pagedep filled in so that free_diradd()
              * below takes its non-DIRCHG path and leaves that dirrem alone.
3333          */
3334         if ((dap->da_state & DIRCHG) != 0) {
3335                 *prevdirremp = dap->da_previous;
3336                 dap->da_state &= ~DIRCHG;
3337                 dap->da_pagedep = pagedep;
3338         }
3339         /*
3340          * We are deleting an entry that never made it to disk.
3341          * Mark it COMPLETE so we can delete its inode immediately.
3342          */
3343         dirrem->dm_state |= COMPLETE;
3344         free_diradd(dap);
3345         return (dirrem);
3346 }
3347
3348 /*
3349  * Directory entry change dependencies.
3350  *
3351  * Changing an existing directory entry requires that an add operation
3352  * be completed first followed by a deletion. The semantics for the addition
3353  * are identical to the description of adding a new entry above except
3354  * that the rollback is to the old inode number rather than zero. Once
3355  * the addition dependency is completed, the removal is done as described
3356  * in the removal routine above.
3357  */
3358
3359 /*
3360  * This routine should be called immediately after changing
3361  * a directory entry.  The inode's link count should not be
3362  * decremented by the calling procedure -- the soft updates
3363  * code will perform this task when it is safe.
      *
      * A dirrem is always created for the old inode (via newdirrem).
      * Unless the new inode number is WINO (a whiteout), a diradd is
      * also created for the new inode and linked to the pagedep and to
      * the new inode's inodedep. lk is released before returning.
3364  */
3365 void
3366 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3367         struct buf *bp;         /* buffer containing directory block */
3368         struct inode *dp;       /* inode for the directory being modified */
3369         struct inode *ip;       /* inode for directory entry being removed */
3370         ino_t newinum;          /* new inode number for changed entry */
3371         int isrmdir;            /* indicates if doing RMDIR */
3372 {
3373         int offset;
             /* dap stays NULL for a whiteout; that path returns early below. */
3374         struct diradd *dap = NULL;
3375         struct dirrem *dirrem, *prevdirrem;
3376         struct pagedep *pagedep;
3377         struct inodedep *inodedep;
3378         struct mount *mp;
3379
3380         offset = blkoff(dp->i_fs, dp->i_offset);
3381         mp = UFSTOVFS(dp->i_ump);
3382
3383         /*
3384          * Whiteouts do not need diradd dependencies.
3385          */
3386         if (newinum != WINO) {
3387                 MALLOC(dap, struct diradd *, sizeof(struct diradd),
3388                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
3389                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
3390                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3391                 dap->da_offset = offset;
3392                 dap->da_newinum = newinum;
3393         }
3394
3395         /*
3396          * Allocate a new dirrem and ACQUIRE_LOCK.
3397          */
3398         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3399         pagedep = dirrem->dm_pagedep;
3400         /*
3401          * The possible values for isrmdir:
3402          *      0 - non-directory file rename
3403          *      1 - directory rename within same directory
3404          *   inum - directory rename to new directory of given inode number
3405          * When renaming to a new directory, we are both deleting and
3406          * creating a new directory entry, so the link count on the new
3407          * directory should not change. Thus we do not need the followup
3408          * dirrem which is usually done in handle_workitem_remove. We set
3409          * the DIRCHG flag to tell handle_workitem_remove to skip the
3410          * followup dirrem.
3411          */
3412         if (isrmdir > 1)
3413                 dirrem->dm_state |= DIRCHG;
3414
3415         /*
3416          * Whiteouts have no additional dependencies,
3417          * so just put the dirrem on the correct list.
              * A dirrem without COMPLETE waits on the pagedep; a COMPLETE
              * one is queued on the worklist.
3418          */
3419         if (newinum == WINO) {
3420                 if ((dirrem->dm_state & COMPLETE) == 0) {
3421                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
3422                             dm_next);
3423                 } else {
3424                         dirrem->dm_dirinum = pagedep->pd_ino;
3425                         add_to_worklist(&dirrem->dm_list);
3426                 }
3427                 FREE_LOCK(&lk);
3428                 return;
3429         }
3430
3431         /*
3432          * If the COMPLETE flag is clear, then there were no active
3433          * entries and we want to roll back to the previous inode until
3434          * the new inode is committed to disk. If the COMPLETE flag is
3435          * set, then we have deleted an entry that never made it to disk.
3436          * If the entry we deleted resulted from a name change, then the old
3437          * inode reference still resides on disk. Any rollback that we do
3438          * needs to be to that old inode (returned to us in prevdirrem). If
3439          * the entry we deleted resulted from a create, then there is
3440          * no entry on the disk, so we want to roll back to zero rather
3441          * than the uncommitted inode. In either of the COMPLETE cases we
3442          * want to immediately free the unwritten and unreferenced inode.
3443          */
3444         if ((dirrem->dm_state & COMPLETE) == 0) {
3445                 dap->da_previous = dirrem;
3446         } else {
3447                 if (prevdirrem != NULL) {
3448                         dap->da_previous = prevdirrem;
3449                 } else {
                             /* No rollback target: drop DIRCHG, name the pagedep. */
3450                         dap->da_state &= ~DIRCHG;
3451                         dap->da_pagedep = pagedep;
3452                 }
3453                 dirrem->dm_dirinum = pagedep->pd_ino;
3454                 add_to_worklist(&dirrem->dm_list);
3455         }
3456         /*
3457          * Link into its inodedep. Put it on the id_bufwait list if the inode
3458          * is not yet written. If it is written, do the post-inode write
3459          * processing to put it on the id_pendinghd list.
              * NOTE(review): a zero return from inodedep_lookup() with DEPALLOC
              * appears to mean the inodedep was only just created and is
              * treated like a written inode -- confirm in inodedep_lookup().
3460          */
3461         if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
3462             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3463                 dap->da_state |= COMPLETE;
3464                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3465                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3466         } else {
3467                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3468                     dap, da_pdlist);
3469                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3470         }
3471         FREE_LOCK(&lk);
3472 }
3473
3474 /*
3475  * Called whenever an inode's link count changes. An inodedep is
3476  * looked up (and created if absent) and the difference between the
3477  * link count and the effective link count is stored in it, so that
3478  * the new reference(s) cannot reach disk before the updated inode.
3479  */
3480 void
3481 softdep_change_linkcnt(ip)
3482         struct inode *ip;       /* the inode with the increased link count */
3483 {
3484         struct inodedep *idp;
3485         struct mount *mp = UFSTOVFS(ip->i_ump);
3486
3487         ACQUIRE_LOCK(&lk);
3488         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &idp);
3489         if (ip->i_effnlink > ip->i_nlink)
3490                 panic("softdep_change_linkcnt: bad delta");
3491         idp->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3492         FREE_LOCK(&lk);
3493 }
3494
3495 /*
3496  * Account for the space held by a file whose effective link count
3497  * and reference count have both dropped to zero: no name in the
3498  * filesystem refers to it and there are no active file references.
3499  * Its blocks and inode are added to the pending totals; the space is
3500  * freed as soon as the necessary soft dependencies are cleared.
3501  */
3502 void
3503 softdep_releasefile(ip)
3504         struct inode *ip;       /* inode with the zero effective link count */
3505 {
3506         struct inodedep *idp;
3507         struct fs *fs;
3508         int extblocks;
3509
3510         if (ip->i_effnlink > 0)
3511                 panic("softdep_releasefile: file still referenced");
3512         /*
3513          * This can be called repeatedly while the on-disk link count
3514          * falls to zero; the space must be accounted for only once.
3515          */
3516         if ((ip->i_flag & IN_SPACECOUNTED) != 0)
3517                 return;
3518         /*
3519          * A snapshot has to be deactivated first, otherwise copyonwrites
3520          * may add blocks and the cleanup may remove blocks after we have
3521          * tried to account for them.
3522          */
3523         if ((ip->i_flags & SF_SNAPSHOT) != 0)
3524                 ffs_snapremove(ITOV(ip));
3525         /*
3526          * When an nlinkdelta is being tracked (an inodedep exists), note
3527          * there as well that the freed space has been accounted for.
3528          */
3529         ACQUIRE_LOCK(&lk);
3530         if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &idp) != 0)
3531                 idp->id_state |= SPACECOUNTED;
3532         FREE_LOCK(&lk);
3533         fs = ip->i_fs;
3534         /* di_extsize is read only for UFS2; otherwise nothing is excluded. */
3535         extblocks = (fs->fs_magic != FS_UFS2_MAGIC) ? 0 :
3536             btodb(fragroundup(fs, ip->i_din2->di_extsize));
3537         UFS_LOCK(ip->i_ump);
3538         fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
3539         fs->fs_pendinginodes += 1;
3540         UFS_UNLOCK(ip->i_ump);
3541         ip->i_flag |= IN_SPACECOUNTED;
3542 }
3543
3544 /*
3545  * This workitem decrements the inode's link count.
3546  * If the link count reaches zero, the file is removed.
      *
      * If xp is non-NULL it is used as the vnode for dm_oldinum; otherwise
      * the vnode is obtained with ffs_vget(). If ffs_vget() fails the
      * error is reported and nothing else is done. On every other path
      * the vnode is released with vput() before returning.
3547  */
3548 static void
3549 handle_workitem_remove(dirrem, xp)
3550         struct dirrem *dirrem;
3551         struct vnode *xp;
3552 {
3553         struct thread *td = curthread;
3554         struct inodedep *inodedep;
3555         struct vnode *vp;
3556         struct inode *ip;
3557         ino_t oldinum;
3558         int error;
3559
             /* Use the caller's vnode when given, else fetch one exclusively. */
3560         if ((vp = xp) == NULL &&
3561             (error = ffs_vget(dirrem->dm_list.wk_mp,
3562             dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) {
3563                 softdep_error("handle_workitem_remove: vget", error);
3564                 return;
3565         }
3566         ip = VTOI(vp);
3567         ACQUIRE_LOCK(&lk);
3568         if ((inodedep_lookup(dirrem->dm_list.wk_mp,
3569             dirrem->dm_oldinum, 0, &inodedep)) == 0)
3570                 panic("handle_workitem_remove: lost inodedep");
3571         /*
3572          * Normal file deletion.
              * Drop one link, refresh the inodedep's link delta and free
              * the dirrem.
3573          */
3574         if ((dirrem->dm_state & RMDIR) == 0) {
3575                 ip->i_nlink--;
3576                 DIP_SET(ip, i_nlink, ip->i_nlink);
3577                 ip->i_flag |= IN_CHANGE;
3578                 if (ip->i_nlink < ip->i_effnlink)
3579                         panic("handle_workitem_remove: bad file delta");
3580                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3581                 num_dirrem -= 1;
3582                 WORKITEM_FREE(dirrem, D_DIRREM);
3583                 FREE_LOCK(&lk);
3584                 vput(vp);
3585                 return;
3586         }
3587         /*
3588          * Directory deletion. Decrement reference count for both the
3589          * just deleted parent directory entry and the reference for ".".
3590          * Next truncate the directory to length zero. When the
3591          * truncation completes, arrange to have the reference count on
3592          * the parent decremented to account for the loss of "..".
3593          */
3594         ip->i_nlink -= 2;
3595         DIP_SET(ip, i_nlink, ip->i_nlink);
3596         ip->i_flag |= IN_CHANGE;
3597         if (ip->i_nlink < ip->i_effnlink)
3598                 panic("handle_workitem_remove: bad dir delta");
3599         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
             /* lk is not held across the truncation; it is retaken after. */
3600         FREE_LOCK(&lk);
3601         if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
3602                 softdep_error("handle_workitem_remove: truncate", error);
3603         ACQUIRE_LOCK(&lk);
3604         /*
3605          * Rename a directory to a new parent. Since, we are both deleting
3606          * and creating a new directory entry, the link count on the new
3607          * directory should not change. Thus we skip the followup dirrem.
3608          */
3609         if (dirrem->dm_state & DIRCHG) {
3610                 num_dirrem -= 1;
3611                 WORKITEM_FREE(dirrem, D_DIRREM);
3612                 FREE_LOCK(&lk);
3613                 vput(vp);
3614                 return;
3615         }
3616         /*
3617          * If the inodedep does not exist, then the zero'ed inode has
3618          * been written to disk. If the allocated inode has never been
3619          * written to disk, then the on-disk inode is zero'ed. In either
3620          * case we can remove the file immediately.
              *
              * The dirrem is reused for the follow-up on the parent: its
              * state is cleared, so it is next handled as a normal file
              * deletion, and dm_oldinum is switched to the parent directory
              * (dm_dirinum).
3621          */
3622         dirrem->dm_state = 0;
3623         oldinum = dirrem->dm_oldinum;
3624         dirrem->dm_oldinum = dirrem->dm_dirinum;
3625         if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
3626             0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
                     /*
                      * With a caller-supplied vnode the follow-up is queued
                      * on the worklist; otherwise it is handled right here
                      * once lk and the vnode have been released.
                      */
3627                 if (xp != NULL)
3628                         add_to_worklist(&dirrem->dm_list);
3629                 FREE_LOCK(&lk);
3630                 vput(vp);
3631                 if (xp == NULL)
3632                         handle_workitem_remove(dirrem, NULL);
3633                 return;
3634         }
             /*
              * Otherwise park the dirrem on the inodedep's id_inowait list
              * and push the inode out with ffs_update().
              */
3635         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3636         FREE_LOCK(&lk);
3637         ip->i_flag |= IN_CHANGE;
3638         ffs_update(vp, 0);
3639         vput(vp);
3640 }
3641
3642 /*
3643  * Inode de-allocation dependencies.
3644  *
3645  * When an inode's link count is reduced to zero, it can be de-allocated. We
3646  * found it convenient to postpone de-allocation until after the inode is
3647  * written to disk with its new link count (zero).  At this point, all of the
3648  * on-disk inode's block pointers are nullified and, with careful dependency
3649  * list ordering, all dependencies related to the inode will be satisfied and
3650  * the corresponding dependency structures de-allocated.  So, if/when the
3651  * inode is reused, there will be no mixing of old dependencies with new
3652  * ones.  This artificial dependency is set up by the block de-allocation
3653  * procedure above (softdep_setup_freeblocks) and completed by the
3654  * following procedure.
3655  */
3656 static void
3657 handle_workitem_freefile(freefile)
3658         struct freefile *freefile;      /* work item naming the inode */
3659 {
3660         struct ufsmount *ump = VFSTOUFS(freefile->fx_list.wk_mp);
3661         struct fs *fs = ump->um_fs;
3662         struct inodedep *idp;
3663         int error;
3664
3665 #ifdef DEBUG
3666         /* No inodedep is expected to remain for the inode by now. */
3667         ACQUIRE_LOCK(&lk);
3668         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
3669         FREE_LOCK(&lk);
3670         if (error != 0)
3671                 panic("handle_workitem_freefile: inodedep survived");
3672 #endif
3673         UFS_LOCK(ump);
3674         fs->fs_pendinginodes -= 1;
3675         UFS_UNLOCK(ump);
3676         error = ffs_freefile(ump, fs, freefile->fx_devvp,
3677             freefile->fx_oldinum, freefile->fx_mode);
3678         if (error != 0)
3679                 softdep_error("handle_workitem_freefile", error);
3680         /* The work item itself is released with lk held. */
3681         ACQUIRE_LOCK(&lk);
3682         WORKITEM_FREE(freefile, D_FREEFILE);
3683         FREE_LOCK(&lk);
3684 }
3684
3685
3686 /*
3687  * Take the marker off the work list it is threaded on and hand back
3688  * the element that followed it on that list.
3689  */
3690 static __inline struct worklist *
3691 markernext(struct worklist *marker)
3692 {
3693         struct worklist *successor = LIST_NEXT(marker, wk_list);
3694
3695         /* The successor is captured before the marker is unlinked. */
3696         LIST_REMOVE(marker, wk_list);
3697         return (successor);
3698 }
3699
3700 /*
3701  * Disk writes.
3702  *
3703  * The dependency structures constructed above are most actively used when file
3704  * system blocks are written to disk.  No constraints are placed on when a
3705  * block can be written, but unsatisfied update dependencies are made safe by
3706  * modifying (or replacing) the source memory for the duration of the disk
3707  * write.  When the disk write completes, the memory block is again brought
3708  * up-to-date.
3709  *
3710  * In-core inode structure reclamation.
3711  *
3712  * Because there are a finite number of "in-core" inode structures, they are
3713  * reused regularly.  By transferring all inode-related dependencies to the
3714  * in-memory inode block and indexing them separately (via "inodedep"s), we
3715  * can allow "in-core" inode structures to be reused at any time and avoid
3716  * any increase in contention.
3717  *
3718  * Called just before entering the device driver to initiate a new disk I/O.
3719  * The buffer must be locked, thus, no I/O completion operations can occur
3720  * while we are manipulating its associated dependencies.
3721  */
3722 static void
3723 softdep_disk_io_initiation(bp)
3724         struct buf *bp;         /* structure describing disk write to occur */
3725 {
3726         struct worklist *wk;
3727         struct worklist marker;
3728         struct indirdep *indirdep;
3729         struct inodedep *inodedep;
3730
3731         /*
3732          * We only care about write operations. There should never
3733          * be dependencies for reads.
3734          */
3735         if (bp->b_iocmd != BIO_WRITE)
3736                 panic("softdep_disk_io_initiation: not write");
3737
3738         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
3739         PHOLD(curproc);                 /* Don't swap out kernel stack */
3740
3741         ACQUIRE_LOCK(&lk);
3742         /*
3743          * Do any necessary pre-I/O processing.
3744          */
3745         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
3746              wk = markernext(&marker)) {
3747                 LIST_INSERT_AFTER(wk, &marker, wk_list);
3748                 switch (wk->wk_type) {
3749
3750                 case D_PAGEDEP:
3751                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
3752                         continue;
3753
3754                 case D_INODEDEP:
3755                         inodedep = WK_INODEDEP(wk);
3756                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3757                                 initiate_write_inodeblock_ufs1(inodedep, bp);
3758                         else
3759                                 initiate_write_inodeblock_ufs2(inodedep, bp);
3760                         continue;
3761
3762                 case D_INDIRDEP:
3763                         indirdep = WK_INDIRDEP(wk);
3764                         if (indirdep->ir_state & GOINGAWAY)
3765                                 panic("disk_io_initiation: indirdep gone");
3766                         /*
3767                          * If there are no remaining dependencies, this
3768                          * will be writing the real pointers, so the
3769                          * dependency can be freed.
3770                          */
3771                         if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
3772                                 struct buf *bp;
3773
3774                                 bp = indirdep->ir_savebp;
3775                                 bp->b_flags |= B_INVAL | B_NOCACHE;
3776                                 /* inline expand WORKLIST_REMOVE(wk); */
3777                                 wk->wk_state &= ~ONWORKLIST;
3778                                 LIST_REMOVE(wk, wk_list);
3779                                 WORKITEM_FREE(indirdep, D_INDIRDEP);
3780                                 FREE_LOCK(&lk);
3781                                 brelse(bp);
3782                                 ACQUIRE_LOCK(&lk);
3783                                 continue;
3784                         }
3785                         /*
3786                          * Replace up-to-date version with safe version.
3787                          */
3788                         FREE_LOCK(&lk);
3789                         MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3790                             M_INDIRDEP, M_SOFTDEP_FLAGS);
3791                         ACQUIRE_LOCK(&lk);
3792                         indirdep->ir_state &= ~ATTACHED;
3793                         indirdep->ir_state |= UNDONE;
3794                         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3795                         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3796                             bp->b_bcount);
3797                         continue;
3798
3799                 case D_MKDIR:
3800                 case D_BMSAFEMAP:
3801                 case D_ALLOCDIRECT:
3802                 case D_ALLOCINDIR:
3803                         continue;
3804
3805                 default:
3806                         panic("handle_disk_io_initiation: Unexpected type %s",
3807                             TYPENAME(wk->wk_type));
3808                         /* NOTREACHED */
3809                 }
3810         }
3811         FREE_LOCK(&lk);
3812         PRELE(curproc);                 /* Allow swapout of kernel stack */
3813 }
3814
3815 /*
3816  * Called from within the procedure above to deal with unsatisfied
3817  * allocation dependencies in a directory. The buffer must be locked,
3818  * thus, no I/O completion operations can occur while we are
3819  * manipulating its associated dependencies.
3820  */
3821 static void
3822 initiate_write_filepage(pagedep, bp)
3823         struct pagedep *pagedep;
3824         struct buf *bp;
3825 {
3826         struct diradd *dap;
3827         struct direct *ep;
3828         int i;
3829
3830         if (pagedep->pd_state & IOSTARTED) {
3831                 /*
3832                  * This can only happen if there is a driver that does not
3833                  * understand chaining. Here biodone will reissue the call
3834                  * to strategy for the incomplete buffers.
3835                  */
3836                 printf("initiate_write_filepage: already started\n");
3837                 return;
3838         }
3839         pagedep->pd_state |= IOSTARTED;
3840         for (i = 0; i < DAHASHSZ; i++) {
3841                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3842                         ep = (struct direct *)
3843                             ((char *)bp->b_data + dap->da_offset);
3844                         if (ep->d_ino != dap->da_newinum)
3845                                 panic("%s: dir inum %d != new %d",
3846                                     "initiate_write_filepage",
3847                                     ep->d_ino, dap->da_newinum);
3848                         if (dap->da_state & DIRCHG)
3849                                 ep->d_ino = dap->da_previous->dm_oldinum;
3850                         else
3851                                 ep->d_ino = 0;
3852                         dap->da_state &= ~ATTACHED;
3853                         dap->da_state |= UNDONE;
3854                 }
3855         }
3856 }
3857
3858 /*
3859  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
3860  * Note that any bug fixes made to this routine must be done in the
3861  * version found below.
3862  *
3863  * Called from within the procedure above to deal with unsatisfied
3864  * allocation dependencies in an inodeblock. The buffer must be
3865  * locked, thus, no I/O completion operations can occur while we
3866  * are manipulating its associated dependencies.
3867  */
3868 static void
3869 initiate_write_inodeblock_ufs1(inodedep, bp)
3870         struct inodedep *inodedep;
3871         struct buf *bp;                 /* The inode block */
3872 {
3873         struct allocdirect *adp, *lastadp;
3874         struct ufs1_dinode *dp;
3875         struct ufs1_dinode *sip;
3876         struct fs *fs;
3877         ufs_lbn_t i;
3878 #ifdef INVARIANTS
3879         ufs_lbn_t prevlbn = 0;
3880 #endif
3881         int deplist;
3882
3883         if (inodedep->id_state & IOSTARTED)
3884                 panic("initiate_write_inodeblock_ufs1: already started");
3885         inodedep->id_state |= IOSTARTED;
3886         fs = inodedep->id_fs;
3887         dp = (struct ufs1_dinode *)bp->b_data +
3888             ino_to_fsbo(fs, inodedep->id_ino);
3889         /*
3890          * If the bitmap is not yet written, then the allocated
3891          * inode cannot be written to disk.
3892          */
3893         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3894                 if (inodedep->id_savedino1 != NULL)
3895                         panic("initiate_write_inodeblock_ufs1: I/O underway");
3896                 FREE_LOCK(&lk);
3897                 MALLOC(sip, struct ufs1_dinode *,
3898                     sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
3899                 ACQUIRE_LOCK(&lk);
3900                 inodedep->id_savedino1 = sip;
3901                 *inodedep->id_savedino1 = *dp;
3902                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3903                 dp->di_gen = inodedep->id_savedino1->di_gen;
3904                 return;
3905         }
3906         /*
3907          * If no dependencies, then there is nothing to roll back.
3908          */
3909         inodedep->id_savedsize = dp->di_size;
3910         inodedep->id_savedextsize = 0;
3911         if (TAILQ_EMPTY(&inodedep->id_inoupdt))
3912                 return;
3913         /*
3914          * Set the dependencies to busy.
3915          */
3916         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3917              adp = TAILQ_NEXT(adp, ad_next)) {
3918 #ifdef INVARIANTS
3919                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
3920                         panic("softdep_write_inodeblock: lbn order");
3921                 prevlbn = adp->ad_lbn;
3922                 if (adp->ad_lbn < NDADDR &&
3923                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
3924                         panic("%s: direct pointer #%jd mismatch %d != %jd",
3925                             "softdep_write_inodeblock",
3926                             (intmax_t)adp->ad_lbn,
3927                             dp->di_db[adp->ad_lbn],
3928                             (intmax_t)adp->ad_newblkno);
3929                 if (adp->ad_lbn >= NDADDR &&
3930                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
3931                         panic("%s: indirect pointer #%jd mismatch %d != %jd",
3932                             "softdep_write_inodeblock",
3933                             (intmax_t)adp->ad_lbn - NDADDR,
3934                             dp->di_ib[adp->ad_lbn - NDADDR],
3935                             (intmax_t)adp->ad_newblkno);
3936                 deplist |= 1 << adp->ad_lbn;
3937                 if ((adp->ad_state & ATTACHED) == 0)
3938                         panic("softdep_write_inodeblock: Unknown state 0x%x",
3939                             adp->ad_state);
3940 #endif /* INVARIANTS */
3941                 adp->ad_state &= ~ATTACHED;
3942                 adp->ad_state |= UNDONE;
3943         }
3944         /*
3945          * The on-disk inode cannot claim to be any larger than the last
3946          * fragment that has been written. Otherwise, the on-disk inode
3947          * might have fragments that were not the last block in the file
3948          * which would corrupt the filesystem.
3949          */
3950         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3951              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3952                 if (adp->ad_lbn >= NDADDR)
3953                         break;
3954                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3955                 /* keep going until hitting a rollback to a frag */
3956                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3957                         continue;
3958                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3959                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3960 #ifdef INVARIANTS
3961                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
3962                                 panic("softdep_write_inodeblock: lost dep1");
3963 #endif /* INVARIANTS */
3964                         dp->di_db[i] = 0;
3965                 }
3966                 for (i = 0; i < NIADDR; i++) {
3967 #ifdef INVARIANTS
3968                         if (dp->di_ib[i] != 0 &&
3969                             (deplist & ((1 << NDADDR) << i)) == 0)
3970                                 panic("softdep_write_inodeblock: lost dep2");
3971 #endif /* INVARIANTS */
3972                         dp->di_ib[i] = 0;
3973                 }
3974                 return;
3975         }
3976         /*
3977          * If we have zero'ed out the last allocated block of the file,
3978          * roll back the size to the last currently allocated block.
3979          * We know that this last allocated block is a full-sized as
3980          * we already checked for fragments in the loop above.
3981          */
3982         if (lastadp != NULL &&
3983             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3984                 for (i = lastadp->ad_lbn; i >= 0; i--)
3985                         if (dp->di_db[i] != 0)
3986                                 break;
3987                 dp->di_size = (i + 1) * fs->fs_bsize;
3988         }
3989         /*
3990          * The only dependencies are for indirect blocks.
3991          *
3992          * The file size for indirect block additions is not guaranteed.
3993          * Such a guarantee would be non-trivial to achieve. The conventional
3994          * synchronous write implementation also does not make this guarantee.
3995          * Fsck should catch and fix discrepancies. Arguably, the file size
3996          * can be over-estimated without destroying integrity when the file
3997          * moves into the indirect blocks (i.e., is large). If we want to
3998          * postpone fsck, we are stuck with this argument.
3999          */
4000         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
4001                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
4002 }
4003
4004 /*
4005  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
4006  * Note that any bug fixes made to this routine must be done in the
4007  * version found above.
4008  *
4009  * Called from within the procedure above to deal with unsatisfied
4010  * allocation dependencies in an inodeblock. The buffer must be
4011  * locked, thus, no I/O completion operations can occur while we
4012  * are manipulating its associated dependencies.
4013  */
4014 static void
4015 initiate_write_inodeblock_ufs2(inodedep, bp)
4016         struct inodedep *inodedep;
4017         struct buf *bp;                 /* The inode block */
4018 {
4019         struct allocdirect *adp, *lastadp;
4020         struct ufs2_dinode *dp;
4021         struct ufs2_dinode *sip;
4022         struct fs *fs;
4023         ufs_lbn_t i;
4024 #ifdef INVARIANTS
4025         ufs_lbn_t prevlbn = 0;
4026 #endif
4027         int deplist;
4028
4029         if (inodedep->id_state & IOSTARTED)
4030                 panic("initiate_write_inodeblock_ufs2: already started");
4031         inodedep->id_state |= IOSTARTED;
4032         fs = inodedep->id_fs;
4033         dp = (struct ufs2_dinode *)bp->b_data +
4034             ino_to_fsbo(fs, inodedep->id_ino);
4035         /*
4036          * If the bitmap is not yet written, then the allocated
4037          * inode cannot be written to disk.
4038          */
4039         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4040                 if (inodedep->id_savedino2 != NULL)
4041                         panic("initiate_write_inodeblock_ufs2: I/O underway");
4042                 FREE_LOCK(&lk);
4043                 MALLOC(sip, struct ufs2_dinode *,
4044                     sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
4045                 ACQUIRE_LOCK(&lk);
4046                 inodedep->id_savedino2 = sip;
4047                 *inodedep->id_savedino2 = *dp;
4048                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
4049                 dp->di_gen = inodedep->id_savedino2->di_gen;
4050                 return;
4051         }
4052         /*
4053          * If no dependencies, then there is nothing to roll back.
4054          */
4055         inodedep->id_savedsize = dp->di_size;
4056         inodedep->id_savedextsize = dp->di_extsize;
4057         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
4058             TAILQ_EMPTY(&inodedep->id_extupdt))
4059                 return;
4060         /*
4061          * Set the ext data dependencies to busy.
4062          */
4063         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
4064              adp = TAILQ_NEXT(adp, ad_next)) {
4065 #ifdef INVARIANTS
4066                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
4067                         panic("softdep_write_inodeblock: lbn order");
4068                 prevlbn = adp->ad_lbn;
4069                 if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
4070                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
4071                             "softdep_write_inodeblock",
4072                             (intmax_t)adp->ad_lbn,
4073                             (intmax_t)dp->di_extb[adp->ad_lbn],
4074                             (intmax_t)adp->ad_newblkno);
4075                 deplist |= 1 << adp->ad_lbn;
4076                 if ((adp->ad_state & ATTACHED) == 0)
4077                         panic("softdep_write_inodeblock: Unknown state 0x%x",
4078                             adp->ad_state);
4079 #endif /* INVARIANTS */
4080                 adp->ad_state &= ~ATTACHED;
4081                 adp->ad_state |= UNDONE;
4082         }
4083         /*
4084          * The on-disk inode cannot claim to be any larger than the last
4085          * fragment that has been written. Otherwise, the on-disk inode
4086          * might have fragments that were not the last block in the ext
4087          * data which would corrupt the filesystem.
4088          */
4089         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
4090              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
4091                 dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
4092                 /* keep going until hitting a rollback to a frag */
4093                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
4094                         continue;
4095                 dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
4096                 for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
4097 #ifdef INVARIANTS
4098                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
4099                                 panic("softdep_write_inodeblock: lost dep1");
4100 #endif /* INVARIANTS */
4101                         dp->di_extb[i] = 0;
4102                 }
4103                 lastadp = NULL;
4104                 break;
4105         }
4106         /*
4107          * If we have zero'ed out the last allocated block of the ext
4108          * data, roll back the size to the last currently allocated block.
4109          * We know that this last allocated block is a full-sized as
4110          * we already checked for fragments in the loop above.
4111          */
4112         if (lastadp != NULL &&
4113             dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4114                 for (i = lastadp->ad_lbn; i >= 0; i--)
4115                         if (dp->di_extb[i] != 0)
4116                                 break;
4117                 dp->di_extsize = (i + 1) * fs->fs_bsize;
4118         }
4119         /*
4120          * Set the file data dependencies to busy.
4121          */
4122         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4123              adp = TAILQ_NEXT(adp, ad_next)) {
4124 #ifdef INVARIANTS
4125                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
4126                         panic("softdep_write_inodeblock: lbn order");
4127                 prevlbn = adp->ad_lbn;
4128                 if (adp->ad_lbn < NDADDR &&
4129                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
4130                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
4131                             "softdep_write_inodeblock",
4132                             (intmax_t)adp->ad_lbn,
4133                             (intmax_t)dp->di_db[adp->ad_lbn],
4134                             (intmax_t)adp->ad_newblkno);
4135                 if (adp->ad_lbn >= NDADDR &&
4136                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
4137                         panic("%s indirect pointer #%jd mismatch %jd != %jd",
4138                             "softdep_write_inodeblock:",
4139                             (intmax_t)adp->ad_lbn - NDADDR,
4140                             (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
4141                             (intmax_t)adp->ad_newblkno);
4142                 deplist |= 1 << adp->ad_lbn;
4143                 if ((adp->ad_state & ATTACHED) == 0)
4144                         panic("softdep_write_inodeblock: Unknown state 0x%x",
4145                             adp->ad_state);
4146 #endif /* INVARIANTS */
4147                 adp->ad_state &= ~ATTACHED;
4148                 adp->ad_state |= UNDONE;
4149         }
4150         /*
4151          * The on-disk inode cannot claim to be any larger than the last
4152          * fragment that has been written. Otherwise, the on-disk inode
4153          * might have fragments that were not the last block in the file
4154          * which would corrupt the filesystem.
4155          */
4156         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4157              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
4158                 if (adp->ad_lbn >= NDADDR)
4159                         break;
4160                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
4161                 /* keep going until hitting a rollback to a frag */
4162                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
4163                         continue;
4164                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
4165                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
4166 #ifdef INVARIANTS
4167                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
4168                                 panic("softdep_write_inodeblock: lost dep2");
4169 #endif /* INVARIANTS */
4170                         dp->di_db[i] = 0;
4171                 }
4172                 for (i = 0; i < NIADDR; i++) {
4173 #ifdef INVARIANTS
4174                         if (dp->di_ib[i] != 0 &&
4175                             (deplist & ((1 << NDADDR) << i)) == 0)
4176                                 panic("softdep_write_inodeblock: lost dep3");
4177 #endif /* INVARIANTS */
4178                         dp->di_ib[i] = 0;
4179                 }
4180                 return;
4181         }
4182         /*
4183          * If we have zero'ed out the last allocated block of the file,
4184          * roll back the size to the last currently allocated block.
4185          * We know that this last allocated block is a full-sized as
4186          * we already checked for fragments in the loop above.
4187          */
4188         if (lastadp != NULL &&
4189             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4190                 for (i = lastadp->ad_lbn; i >= 0; i--)
4191                         if (dp->di_db[i] != 0)
4192                                 break;
4193                 dp->di_size = (i + 1) * fs->fs_bsize;
4194         }
4195         /*
4196          * The only dependencies are for indirect blocks.
4197          *
4198          * The file size for indirect block additions is not guaranteed.
4199          * Such a guarantee would be non-trivial to achieve. The conventional
4200          * synchronous write implementation also does not make this guarantee.
4201          * Fsck should catch and fix discrepancies. Arguably, the file size
4202          * can be over-estimated without destroying integrity when the file
4203          * moves into the indirect blocks (i.e., is large). If we want to
4204          * postpone fsck, we are stuck with this argument.
4205          */
4206         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
4207                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
4208 }
4209
4210 /*
4211  * This routine is called during the completion interrupt
4212  * service routine for a disk write (from the procedure called
4213  * by the device driver to inform the filesystem caches of
4214  * a request completion).  It should be called early in this
4215  * procedure, before the block is made available to other
4216  * processes or other routines are called.
4217  */
4218 static void
4219 softdep_disk_write_complete(bp)
4220         struct buf *bp;         /* describes the completed disk write */
4221 {
4222         struct worklist *wk;
4223         struct worklist *owk;
4224         struct workhead reattach;
4225         struct newblk *newblk;
4226         struct allocindir *aip;
4227         struct allocdirect *adp;
4228         struct indirdep *indirdep;
4229         struct inodedep *inodedep;
4230         struct bmsafemap *bmsafemap;
4231
4232         /*
4233          * If an error occurred while doing the write, then the data
4234          * has not hit the disk and the dependencies cannot be unrolled.
4235          */
4236         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
4237                 return;
4238         LIST_INIT(&reattach);
4239         /*
4240          * This lock must not be released anywhere in this code segment.
4241          */
4242         ACQUIRE_LOCK(&lk);
4243         owk = NULL;
4244         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
4245                 WORKLIST_REMOVE(wk);
4246                 if (wk == owk)
4247                         panic("duplicate worklist: %p\n", wk);
4248                 owk = wk;
4249                 switch (wk->wk_type) {
4250
4251                 case D_PAGEDEP:
4252                         if (handle_written_filepage(WK_PAGEDEP(wk), bp))
4253                                 WORKLIST_INSERT(&reattach, wk);
4254                         continue;
4255
4256                 case D_INODEDEP:
4257                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
4258                                 WORKLIST_INSERT(&reattach, wk);
4259                         continue;
4260
4261                 case D_BMSAFEMAP:
4262                         bmsafemap = WK_BMSAFEMAP(wk);
4263                         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
4264                                 newblk->nb_state |= DEPCOMPLETE;
4265                                 newblk->nb_bmsafemap = NULL;
4266                                 LIST_REMOVE(newblk, nb_deps);
4267                         }
4268                         while ((adp =
4269                            LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
4270                                 adp->ad_state |= DEPCOMPLETE;
4271                                 adp->ad_buf = NULL;
4272                                 LIST_REMOVE(adp, ad_deps);
4273                                 handle_allocdirect_partdone(adp);
4274                         }
4275                         while ((aip =
4276                             LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
4277                                 aip->ai_state |= DEPCOMPLETE;
4278                                 aip->ai_buf = NULL;
4279                                 LIST_REMOVE(aip, ai_deps);
4280                                 handle_allocindir_partdone(aip);
4281                         }
4282                         while ((inodedep =
4283                              LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
4284                                 inodedep->id_state |= DEPCOMPLETE;
4285                                 LIST_REMOVE(inodedep, id_deps);
4286                                 inodedep->id_buf = NULL;
4287                         }
4288                         WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4289                         continue;
4290
4291                 case D_MKDIR:
4292                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
4293                         continue;
4294
4295                 case D_ALLOCDIRECT:
4296                         adp = WK_ALLOCDIRECT(wk);
4297                         adp->ad_state |= COMPLETE;
4298                         handle_allocdirect_partdone(adp);
4299                         continue;
4300
4301                 case D_ALLOCINDIR:
4302                         aip = WK_ALLOCINDIR(wk);
4303                         aip->ai_state |= COMPLETE;
4304                         handle_allocindir_partdone(aip);
4305                         continue;
4306
4307                 case D_INDIRDEP:
4308                         indirdep = WK_INDIRDEP(wk);
4309                         if (indirdep->ir_state & GOINGAWAY)
4310                                 panic("disk_write_complete: indirdep gone");
4311                         bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
4312                         FREE(indirdep->ir_saveddata, M_INDIRDEP);
4313                         indirdep->ir_saveddata = 0;
4314                         indirdep->ir_state &= ~UNDONE;
4315                         indirdep->ir_state |= ATTACHED;
4316                         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
4317                                 handle_allocindir_partdone(aip);
4318                                 if (aip == LIST_FIRST(&indirdep->ir_donehd))
4319                                         panic("disk_write_complete: not gone");
4320                         }
4321                         WORKLIST_INSERT(&reattach, wk);
4322                         if ((bp->b_flags & B_DELWRI) == 0)
4323                                 stat_indir_blk_ptrs++;
4324                         bdirty(bp);
4325                         continue;
4326
4327                 default:
4328                         panic("handle_disk_write_complete: Unknown type %s",
4329                             TYPENAME(wk->wk_type));
4330                         /* NOTREACHED */
4331                 }
4332         }
4333         /*
4334          * Reattach any requests that must be redone.
4335          */
4336         while ((wk = LIST_FIRST(&reattach)) != NULL) {
4337                 WORKLIST_REMOVE(wk);
4338                 WORKLIST_INSERT(&bp->b_dep, wk);
4339         }
4340         FREE_LOCK(&lk);
4341 }
4342
4343 /*
4344  * Called from within softdep_disk_write_complete above. Note that
4345  * this routine is always called from interrupt level with further
4346  * splbio interrupts blocked.
4347  */
4348 static void
4349 handle_allocdirect_partdone(adp)
4350         struct allocdirect *adp;        /* the completed allocdirect */
4351 {
4352         struct allocdirectlst *listhead;
4353         struct allocdirect *listadp;
4354         struct inodedep *inodedep;
4355         long bsize, delay;
4356
4357         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4358                 return;
4359         if (adp->ad_buf != NULL)
4360                 panic("handle_allocdirect_partdone: dangling dep");
4361         /*
4362          * The on-disk inode cannot claim to be any larger than the last
4363          * fragment that has been written. Otherwise, the on-disk inode
4364          * might have fragments that were not the last block in the file
4365          * which would corrupt the filesystem. Thus, we cannot free any
4366          * allocdirects after one whose ad_oldblkno claims a fragment as
4367          * these blocks must be rolled back to zero before writing the inode.
4368          * We check the currently active set of allocdirects in id_inoupdt
4369          * or id_extupdt as appropriate.
4370          */
4371         inodedep = adp->ad_inodedep;
4372         bsize = inodedep->id_fs->fs_bsize;
4373         if (adp->ad_state & EXTDATA)
4374                 listhead = &inodedep->id_extupdt;
4375         else
4376                 listhead = &inodedep->id_inoupdt;
4377         TAILQ_FOREACH(listadp, listhead, ad_next) {
4378                 /* found our block */
4379                 if (listadp == adp)
4380                         break;
4381                 /* continue if ad_oldlbn is not a fragment */
4382                 if (listadp->ad_oldsize == 0 ||
4383                     listadp->ad_oldsize == bsize)
4384                         continue;
4385                 /* hit a fragment */
4386                 return;
4387         }
4388         /*
4389          * If we have reached the end of the current list without
4390          * finding the just finished dependency, then it must be
4391          * on the future dependency list. Future dependencies cannot
4392          * be freed until they are moved to the current list.
4393          */
4394         if (listadp == NULL) {
4395 #ifdef DEBUG
4396                 if (adp->ad_state & EXTDATA)
4397                         listhead = &inodedep->id_newextupdt;
4398                 else
4399                         listhead = &inodedep->id_newinoupdt;
4400                 TAILQ_FOREACH(listadp, listhead, ad_next)
4401                         /* found our block */
4402                         if (listadp == adp)
4403                                 break;
4404                 if (listadp == NULL)
4405                         panic("handle_allocdirect_partdone: lost dep");
4406 #endif /* DEBUG */
4407                 return;
4408         }
4409         /*
4410          * If we have found the just finished dependency, then free
4411          * it along with anything that follows it that is complete.
4412          * If the inode still has a bitmap dependency, then it has
4413          * never been written to disk, hence the on-disk inode cannot
4414          * reference the old fragment so we can free it without delay.
4415          */
4416         delay = (inodedep->id_state & DEPCOMPLETE);
4417         for (; adp; adp = listadp) {
4418                 listadp = TAILQ_NEXT(adp, ad_next);
4419                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4420                         return;
4421                 free_allocdirect(listhead, adp, delay);
4422         }
4423 }
4424
4425 /*
4426  * Called from within softdep_disk_write_complete above. Note that
4427  * this routine is always called from interrupt level with further
4428  * splbio interrupts blocked.
4429  */
4430 static void
4431 handle_allocindir_partdone(aip)
4432         struct allocindir *aip;         /* the completed allocindir */
4433 {
4434         struct indirdep *indirdep;
4435
4436         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4437                 return;
4438         if (aip->ai_buf != NULL)
4439                 panic("handle_allocindir_partdone: dangling dependency");
4440         indirdep = aip->ai_indirdep;
4441         if (indirdep->ir_state & UNDONE) {
4442                 LIST_REMOVE(aip, ai_next);
4443                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4444                 return;
4445         }
4446         if (indirdep->ir_state & UFS1FMT)
4447                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4448                     aip->ai_newblkno;
4449         else
4450                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4451                     aip->ai_newblkno;
4452         LIST_REMOVE(aip, ai_next);
4453         if (aip->ai_freefrag != NULL)
4454                 add_to_worklist(&aip->ai_freefrag->ff_list);
4455         WORKITEM_FREE(aip, D_ALLOCINDIR);
4456 }
4457
4458 /*
4459  * Called from within softdep_disk_write_complete above to restore
4460  * in-memory inode block contents to their most up-to-date state. Note
4461  * that this routine is always called from interrupt level with further
4462  * splbio interrupts blocked.
      *
      * Return value:
      *   1  - a saved (rolled back) inode image was copied back into the
      *        buffer and the buffer was redirtied;
      *   0  - a D_FREEFILE item was found on id_bufwait (the inodedep is
      *        freed and the item queued), or free_inodedep() released the
      *        inodedep, or both id_inoupdt and id_extupdt are empty;
      *   otherwise the value of hadchanges, which is nonzero when block
      *   pointers or sizes in the buffer were rolled forward.
4463  */
4464 static int
4465 handle_written_inodeblock(inodedep, bp)
4466         struct inodedep *inodedep;
4467         struct buf *bp;         /* buffer containing the inode block */
4468 {
4469         struct worklist *wk, *filefree;
4470         struct allocdirect *adp, *nextadp;
4471         struct ufs1_dinode *dp1 = NULL;
4472         struct ufs2_dinode *dp2 = NULL;
4473         int hadchanges, fstype;
4474
4475         if ((inodedep->id_state & IOSTARTED) == 0)
4476                 panic("handle_written_inodeblock: not started");
4477         inodedep->id_state &= ~IOSTARTED;
             /*
              * Locate this inode within the buffer. Only one of dp1/dp2 is
              * set, chosen by the filesystem magic number; the other stays
              * NULL.
              */
4478         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4479                 fstype = UFS1;
4480                 dp1 = (struct ufs1_dinode *)bp->b_data +
4481                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4482         } else {
4483                 fstype = UFS2;
4484                 dp2 = (struct ufs2_dinode *)bp->b_data +
4485                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4486         }
4487         /*
4488          * If we had to rollback the inode allocation because of
4489          * bitmaps being incomplete, then simply restore it.
4490          * Keep the block dirty so that it will not be reclaimed until
4491          * all associated dependencies have been cleared and the
4492          * corresponding updates written to disk.
              *
              * NOTE(review): id_savedino1 is tested, freed and cleared even
              * when id_savedino2 is the one copied; presumably the two
              * names refer to the same storage -- confirm against the
              * inodedep declaration.
4493          */
4494         if (inodedep->id_savedino1 != NULL) {
4495                 if (fstype == UFS1)
4496                         *dp1 = *inodedep->id_savedino1;
4497                 else
4498                         *dp2 = *inodedep->id_savedino2;
4499                 FREE(inodedep->id_savedino1, M_SAVEDINO);
4500                 inodedep->id_savedino1 = NULL;
4501                 if ((bp->b_flags & B_DELWRI) == 0)
4502                         stat_inode_bitmap++;
4503                 bdirty(bp);
4504                 return (1);
4505         }
4506         inodedep->id_state |= COMPLETE;
4507         /*
4508          * Roll forward anything that had to be rolled back before
4509          * the inode could be updated.
              *
              * For each allocdirect on id_inoupdt the pointer in the buffer
              * is expected to still hold the rolled-back value (ad_oldblkno
              * for direct blocks, 0 for indirect blocks); anything else is
              * a panic. The pointer is then set to ad_newblkno and the
              * entry is flipped from UNDONE to ATTACHED.
4510          */
4511         hadchanges = 0;
4512         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4513                 nextadp = TAILQ_NEXT(adp, ad_next);
4514                 if (adp->ad_state & ATTACHED)
4515                         panic("handle_written_inodeblock: new entry");
4516                 if (fstype == UFS1) {
4517                         if (adp->ad_lbn < NDADDR) {
4518                                 if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4519                                         panic("%s %s #%jd mismatch %d != %jd",
4520                                             "handle_written_inodeblock:",
4521                                             "direct pointer",
4522                                             (intmax_t)adp->ad_lbn,
4523                                             dp1->di_db[adp->ad_lbn],
4524                                             (intmax_t)adp->ad_oldblkno);
4525                                 dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4526                         } else {
4527                                 if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4528                                         panic("%s: %s #%jd allocated as %d",
4529                                             "handle_written_inodeblock",
4530                                             "indirect pointer",
4531                                             (intmax_t)adp->ad_lbn - NDADDR,
4532                                             dp1->di_ib[adp->ad_lbn - NDADDR]);
4533                                 dp1->di_ib[adp->ad_lbn - NDADDR] =
4534                                     adp->ad_newblkno;
4535                         }
4536                 } else {
4537                         if (adp->ad_lbn < NDADDR) {
4538                                 if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4539                                         panic("%s: %s #%jd %s %jd != %jd",
4540                                             "handle_written_inodeblock",
4541                                             "direct pointer",
4542                                             (intmax_t)adp->ad_lbn, "mismatch",
4543                                             (intmax_t)dp2->di_db[adp->ad_lbn],
4544                                             (intmax_t)adp->ad_oldblkno);
4545                                 dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4546                         } else {
4547                                 if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4548                                         panic("%s: %s #%jd allocated as %jd",
4549                                             "handle_written_inodeblock",
4550                                             "indirect pointer",
4551                                             (intmax_t)adp->ad_lbn - NDADDR,
4552                                             (intmax_t)
4553                                             dp2->di_ib[adp->ad_lbn - NDADDR]);
4554                                 dp2->di_ib[adp->ad_lbn - NDADDR] =
4555                                     adp->ad_newblkno;
4556                         }
4557                 }
4558                 adp->ad_state &= ~UNDONE;
4559                 adp->ad_state |= ATTACHED;
4560                 hadchanges = 1;
4561         }
             /*
              * Same roll-forward for the di_extb[] pointers tracked on
              * id_extupdt. Only dp2 is consulted here.
              * NOTE(review): dp2 is NULL when fstype is UFS1, so this loop
              * presumes id_extupdt is always empty on UFS1 -- confirm.
              */
4562         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
4563                 nextadp = TAILQ_NEXT(adp, ad_next);
4564                 if (adp->ad_state & ATTACHED)
4565                         panic("handle_written_inodeblock: new entry");
4566                 if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
4567                         panic("%s: direct pointers #%jd %s %jd != %jd",
4568                             "handle_written_inodeblock",
4569                             (intmax_t)adp->ad_lbn, "mismatch",
4570                             (intmax_t)dp2->di_extb[adp->ad_lbn],
4571                             (intmax_t)adp->ad_oldblkno);
4572                 dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
4573                 adp->ad_state &= ~UNDONE;
4574                 adp->ad_state |= ATTACHED;
4575                 hadchanges = 1;
4576         }
             /* Count the rollback only if the buffer was not already dirty. */
4577         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4578                 stat_direct_blk_ptrs++;
4579         /*
4580          * Reset the file size to its most up-to-date value.
              * A saved size of -1 means no size was recorded, which is
              * treated as fatal here; both saved sizes are reset to -1
              * once they have been applied.
4581          */
4582         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
4583                 panic("handle_written_inodeblock: bad size");
4584         if (fstype == UFS1) {
4585                 if (dp1->di_size != inodedep->id_savedsize) {
4586                         dp1->di_size = inodedep->id_savedsize;
4587                         hadchanges = 1;
4588                 }
4589         } else {
4590                 if (dp2->di_size != inodedep->id_savedsize) {
4591                         dp2->di_size = inodedep->id_savedsize;
4592                         hadchanges = 1;
4593                 }
4594                 if (dp2->di_extsize != inodedep->id_savedextsize) {
4595                         dp2->di_extsize = inodedep->id_savedextsize;
4596                         hadchanges = 1;
4597                 }
4598         }
4599         inodedep->id_savedsize = -1;
4600         inodedep->id_savedextsize = -1;
4601         /*
4602          * If there were any rollbacks in the inode block, then it must be
4603          * marked dirty so that it will eventually get written back in
4604          * its correct form.
4605          */
4606         if (hadchanges)
4607                 bdirty(bp);
4608         /*
4609          * Process any allocdirects that completed during the update.
4610          */
4611         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4612                 handle_allocdirect_partdone(adp);
4613         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
4614                 handle_allocdirect_partdone(adp);
4615         /*
4616          * Process deallocations that were held pending until the
4617          * inode had been written to disk. Freeing of the inode
4618          * is delayed until after all blocks have been freed to
4619          * avoid creation of new <vfsid, inum, lbn> triples
4620          * before the old ones have been deleted.
4621          */
4622         filefree = NULL;
4623         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4624                 WORKLIST_REMOVE(wk);
4625                 switch (wk->wk_type) {
4626
4627                 case D_FREEFILE:
4628                         /*
4629                          * We defer adding filefree to the worklist until
4630                          * all other additions have been made to ensure
4631                          * that it will be done after all the old blocks
4632                          * have been freed.
4633                          */
4634                         if (filefree != NULL)
4635                                 panic("handle_written_inodeblock: filefree");
4636                         filefree = wk;
4637                         continue;
4638
4639                 case D_MKDIR:
4640                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4641                         continue;
4642
4643                 case D_DIRADD:
4644                         diradd_inode_written(WK_DIRADD(wk), inodedep);
4645                         continue;
4646
4647                 case D_FREEBLKS:
                             /*
                              * The inode write is this item's COMPLETE part;
                              * queue it only once every part is done.
                              */
4648                         wk->wk_state |= COMPLETE;
4649                         if ((wk->wk_state  & ALLCOMPLETE) != ALLCOMPLETE)
4650                                 continue;
4651                          /* -- fall through -- */
4652                 case D_FREEFRAG:
4653                 case D_DIRREM:
4654                         add_to_worklist(wk);
4655                         continue;
4656
4657                 case D_NEWDIRBLK:
4658                         free_newdirblk(WK_NEWDIRBLK(wk));
4659                         continue;
4660
4661                 default:
4662                         panic("handle_written_inodeblock: Unknown type %s",
4663                             TYPENAME(wk->wk_type));
4664                         /* NOTREACHED */
4665                 }
4666         }
             /*
              * A pending file free requires that the inodedep can be
              * released now; it is queued only after that has happened.
              */
4667         if (filefree != NULL) {
4668                 if (free_inodedep(inodedep) == 0)
4669                         panic("handle_written_inodeblock: live inodedep");
4670                 add_to_worklist(filefree);
4671                 return (0);
4672         }
4673
4674         /*
4675          * If no outstanding dependencies, free it.
              * The list checks are evaluated only when free_inodedep()
              * returned zero, i.e. when the inodedep was not released.
4676          */
4677         if (free_inodedep(inodedep) ||
4678             (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
4679              TAILQ_FIRST(&inodedep->id_extupdt) == 0))
4680                 return (0);
4681         return (hadchanges);
4682 }
4683
4684 /*
4685  * Process a diradd entry after its dependent inode has been written.
4686  * This routine must be called with splbio interrupts blocked.
4687  */
4688 static void
4689 diradd_inode_written(dap, inodedep)
4690         struct diradd *dap;
4691         struct inodedep *inodedep;
4692 {
4693         struct pagedep *pagedep;
4694
4695         dap->da_state |= COMPLETE;
4696         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4697                 if (dap->da_state & DIRCHG)
4698                         pagedep = dap->da_previous->dm_pagedep;
4699                 else
4700                         pagedep = dap->da_pagedep;
4701                 LIST_REMOVE(dap, da_pdlist);
4702                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4703         }
4704         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4705 }
4706
4707 /*
4708  * Handle the completion of a mkdir dependency.
4709  */
4710 static void
4711 handle_written_mkdir(mkdir, type)
4712         struct mkdir *mkdir;
4713         int type;
4714 {
4715         struct diradd *dap;
4716         struct pagedep *pagedep;
4717
4718         if (mkdir->md_state != type)
4719                 panic("handle_written_mkdir: bad type");
4720         dap = mkdir->md_diradd;
4721         dap->da_state &= ~type;
4722         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4723                 dap->da_state |= DEPCOMPLETE;
4724         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4725                 if (dap->da_state & DIRCHG)
4726                         pagedep = dap->da_previous->dm_pagedep;
4727                 else
4728                         pagedep = dap->da_pagedep;
4729                 LIST_REMOVE(dap, da_pdlist);
4730                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4731         }
4732         LIST_REMOVE(mkdir, md_mkdirs);
4733         WORKITEM_FREE(mkdir, D_MKDIR);
4734 }
4735
4736 /*
4737  * Called from within softdep_disk_write_complete above.
4738  * A write operation was just completed. Removed inodes can
4739  * now be freed and associated block pointers may be committed.
4740  * Note that this routine is always called from interrupt level
4741  * with further splbio interrupts blocked.
4742  */
4743 static int
4744 handle_written_filepage(pagedep, bp)
4745         struct pagedep *pagedep;
4746         struct buf *bp;         /* buffer containing the written page */
4747 {
4748         struct dirrem *dirrem;
4749         struct diradd *dap, *nextdap;
4750         struct direct *ep;
4751         int i, chgs;
4752
4753         if ((pagedep->pd_state & IOSTARTED) == 0)
4754                 panic("handle_written_filepage: not started");
4755         pagedep->pd_state &= ~IOSTARTED;
4756         /*
4757          * Process any directory removals that have been committed.
4758          */
4759         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4760                 LIST_REMOVE(dirrem, dm_next);
4761                 dirrem->dm_dirinum = pagedep->pd_ino;
4762                 add_to_worklist(&dirrem->dm_list);
4763         }
4764         /*
4765          * Free any directory additions that have been committed.
4766          * If it is a newly allocated block, we have to wait until
4767          * the on-disk directory inode claims the new block.
4768          */
4769         if ((pagedep->pd_state & NEWBLOCK) == 0)
4770                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4771                         free_diradd(dap);
4772         /*
4773          * Uncommitted directory entries must be restored.
4774          */
4775         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4776                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4777                      dap = nextdap) {
4778                         nextdap = LIST_NEXT(dap, da_pdlist);
4779                         if (dap->da_state & ATTACHED)
4780                                 panic("handle_written_filepage: attached");
4781                         ep = (struct direct *)
4782                             ((char *)bp->b_data + dap->da_offset);
4783                         ep->d_ino = dap->da_newinum;
4784                         dap->da_state &= ~UNDONE;
4785                         dap->da_state |= ATTACHED;
4786                         chgs = 1;
4787                         /*
4788                          * If the inode referenced by the directory has
4789                          * been written out, then the dependency can be
4790                          * moved to the pending list.
4791                          */
4792                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4793                                 LIST_REMOVE(dap, da_pdlist);
4794                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4795                                     da_pdlist);
4796                         }
4797                 }
4798         }
4799         /*
4800          * If there were any rollbacks in the directory, then it must be
4801          * marked dirty so that its will eventually get written back in
4802          * its correct form.
4803          */
4804         if (chgs) {
4805                 if ((bp->b_flags & B_DELWRI) == 0)
4806                         stat_dir_entry++;
4807                 bdirty(bp);
4808                 return (1);
4809         }
4810         /*
4811          * If we are not waiting for a new directory block to be
4812          * claimed by its inode, then the pagedep will be freed.
4813          * Otherwise it will remain to track any new entries on
4814          * the page in case they are fsync'ed.
4815          */
4816         if ((pagedep->pd_state & NEWBLOCK) == 0) {
4817                 LIST_REMOVE(pagedep, pd_hash);
4818                 WORKITEM_FREE(pagedep, D_PAGEDEP);
4819         }
4820         return (0);
4821 }
4822
4823 /*
4824  * Writing back in-core inode structures.
4825  *
4826  * The filesystem only accesses an inode's contents when it occupies an
4827  * "in-core" inode structure.  These "in-core" structures are separate from
4828  * the page frames used to cache inode blocks.  Only the latter are
4829  * transferred to/from the disk.  So, when the updated contents of the
4830  * "in-core" inode structure are copied to the corresponding in-memory inode
4831  * block, the dependencies are also transferred.  The following procedure is
4832  * called when copying a dirty "in-core" inode to a cached inode block.
4833  */
4834
4835 /*
4836  * Called when an inode is loaded from disk. If the effective link count
4837  * differed from the actual link count when it was last flushed, then we
4838  * need to ensure that the correct effective link count is put back.
4839  */
4840 void
4841 softdep_load_inodeblock(ip)
4842         struct inode *ip;       /* the "in_core" copy of the inode */
4843 {
4844         struct inodedep *inodedep;
4845
4846         /*
4847          * Check for alternate nlink count.
4848          */
4849         ip->i_effnlink = ip->i_nlink;
4850         ACQUIRE_LOCK(&lk);
4851         if (inodedep_lookup(UFSTOVFS(ip->i_ump),
4852             ip->i_number, 0, &inodedep) == 0) {
4853                 FREE_LOCK(&lk);
4854                 return;
4855         }
4856         ip->i_effnlink -= inodedep->id_nlinkdelta;
4857         if (inodedep->id_state & SPACECOUNTED)
4858                 ip->i_flag |= IN_SPACECOUNTED;
4859         FREE_LOCK(&lk);
4860 }
4861
4862 /*
4863  * This routine is called just before the "in-core" inode
4864  * information is to be copied to the in-memory inode block.
4865  * Recall that an inode block contains several inodes. If
4866  * the force flag is set, then the dependencies will be
4867  * cleared so that the update can always be made. Note that
4868  * the buffer is locked when this routine is called, so we
4869  * will never be in the middle of writing the inode block
4870  * to disk.
      *
      * The "force flag" is the waitfor argument. With waitfor zero the
      * routine returns as soon as the dependency lists have been moved
      * over. With waitfor nonzero, and DEPCOMPLETE not yet set on the
      * inodedep, the buffer recorded in id_buf is written out with
      * bwrite(); a bwrite() failure is reported through softdep_error().
      * The softdep lock (lk) is released on every return path.
4871  */
4872 void
4873 softdep_update_inodeblock(ip, bp, waitfor)
4874         struct inode *ip;       /* the "in_core" copy of the inode */
4875         struct buf *bp;         /* the buffer containing the inode block */
4876         int waitfor;            /* nonzero => update must be allowed */
4877 {
4878         struct inodedep *inodedep;
4879         struct worklist *wk;
4880         struct mount *mp;
4881         struct buf *ibp;
4882         int error;
4883
4884         /*
4885          * If the effective link count is not equal to the actual link
4886          * count, then we must track the difference in an inodedep while
4887          * the inode is (potentially) tossed out of the cache. Otherwise,
4888          * if there is no existing inodedep, then there are no dependencies
4889          * to track.
4890          */
4891         mp = UFSTOVFS(ip->i_ump);
4892         ACQUIRE_LOCK(&lk);
4893         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
4894                 FREE_LOCK(&lk);
4895                 if (ip->i_effnlink != ip->i_nlink)
4896                         panic("softdep_update_inodeblock: bad link count");
4897                 return;
4898         }
             /* The recorded delta must match what the in-core inode shows. */
4899         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
4900                 panic("softdep_update_inodeblock: bad delta");
4901         /*
4902          * Changes have been initiated. Anything depending on these
4903          * changes cannot occur until this inode has been written.
4904          */
4905         inodedep->id_state &= ~COMPLETE;
             /* Hang the inodedep off the inode block buffer if it is not queued. */
4906         if ((inodedep->id_state & ONWORKLIST) == 0)
4907                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4908         /*
4909          * Any new dependencies associated with the incore inode must
4910          * now be moved to the list associated with the buffer holding
4911          * the in-memory copy of the inode. Once merged process any
4912          * allocdirects that are completed by the merger.
4913          */
4914         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
4915         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
4916                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4917         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
4918         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
4919                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
4920         /*
4921          * Now that the inode has been pushed into the buffer, the
4922          * operations dependent on the inode being written to disk
4923          * can be moved to the id_bufwait so that they will be
4924          * processed when the buffer I/O completes.
4925          */
4926         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4927                 WORKLIST_REMOVE(wk);
4928                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4929         }
4930         /*
4931          * Newly allocated inodes cannot be written until the bitmap
4932          * that allocates them has been written (indicated by
4933          * DEPCOMPLETE being set in id_state). If we are doing a
4934          * forced sync (e.g., an fsync on a file), we force the bitmap
4935          * to be written so that the update can be done.
4936          */
4937         if (waitfor == 0) {
4938                 FREE_LOCK(&lk);
4939                 return;
4940         }
             /*
              * Loop until DEPCOMPLETE is seen, the inodedep can no longer
              * be found, or getdirtybuf() hands back a buffer to write.
              */
4941 retry:
4942         if ((inodedep->id_state & DEPCOMPLETE) != 0) {
4943                 FREE_LOCK(&lk);
4944                 return;
4945         }
4946         ibp = inodedep->id_buf;
4947         ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
4948         if (ibp == NULL) {
4949                 /*
4950                  * If ibp came back as NULL, the dependency could have been
4951                  * freed while we slept.  Look it up again, and check to see
4952                  * that it has completed.
4953                  */
4954                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
4955                         goto retry;
4956                 FREE_LOCK(&lk);
4957                 return;
4958         }
             /* Drop the softdep lock before issuing the write. */
4959         FREE_LOCK(&lk);
4960         if ((error = bwrite(ibp)) != 0)
4961                 softdep_error("softdep_update_inodeblock: bwrite", error);
4962 }
4963
4964 /*
4965  * Merge the a new inode dependency list (such as id_newinoupdt) into an
4966  * old inode dependency list (such as id_inoupdt). This routine must be
4967  * called with splbio interrupts blocked.
4968  */
4969 static void
4970 merge_inode_lists(newlisthead, oldlisthead)
4971         struct allocdirectlst *newlisthead;
4972         struct allocdirectlst *oldlisthead;
4973 {
4974         struct allocdirect *listadp, *newadp;
4975
4976         newadp = TAILQ_FIRST(newlisthead);
4977         for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
4978                 if (listadp->ad_lbn < newadp->ad_lbn) {
4979                         listadp = TAILQ_NEXT(listadp, ad_next);
4980                         continue;
4981                 }
4982                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
4983                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4984                 if (listadp->ad_lbn == newadp->ad_lbn) {
4985                         allocdirect_merge(oldlisthead, newadp,
4986                             listadp);
4987                         listadp = newadp;
4988                 }
4989                 newadp = TAILQ_FIRST(newlisthead);
4990         }
4991         while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
4992                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
4993                 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
4994         }
4995 }
4996
4997 /*
4998  * If we are doing an fsync, then we must ensure that any directory
4999  * entries for the inode have been written after the inode gets to disk.
5000  */
5001 int
5002 softdep_fsync(vp)
5003         struct vnode *vp;       /* the "in_core" copy of the inode */
5004 {
5005         struct inodedep *inodedep;
5006         struct pagedep *pagedep;
5007         struct worklist *wk;
5008         struct diradd *dap;
5009         struct mount *mp;
5010         struct vnode *pvp;
5011         struct inode *ip;
5012         struct buf *bp;
5013         struct fs *fs;
5014         struct thread *td = curthread;
5015         int error, flushparent, pagedep_new_block;
5016         ino_t parentino;
5017         ufs_lbn_t lbn;
5018
5019         ip = VTOI(vp);
5020         fs = ip->i_fs;
5021         mp = vp->v_mount;
5022         ACQUIRE_LOCK(&lk);
5023         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
5024                 FREE_LOCK(&lk);
5025                 return (0);
5026         }
5027         if (!LIST_EMPTY(&inodedep->id_inowait) ||
5028             !LIST_EMPTY(&inodedep->id_bufwait) ||
5029             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5030             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5031             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5032             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
5033                 panic("softdep_fsync: pending ops");
5034         for (error = 0, flushparent = 0; ; ) {
5035                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
5036                         break;
5037                 if (wk->wk_type != D_DIRADD)
5038                         panic("softdep_fsync: Unexpected type %s",
5039                             TYPENAME(wk->wk_type));
5040                 dap = WK_DIRADD(wk);
5041                 /*
5042                  * Flush our parent if this directory entry has a MKDIR_PARENT
5043                  * dependency or is contained in a newly allocated block.
5044                  */
5045                 if (dap->da_state & DIRCHG)
5046                         pagedep = dap->da_previous->dm_pagedep;
5047                 else
5048                         pagedep = dap->da_pagedep;
5049                 parentino = pagedep->pd_ino;
5050                 lbn = pagedep->pd_lbn;
5051                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
5052                         panic("softdep_fsync: dirty");
5053                 if ((dap->da_state & MKDIR_PARENT) ||
5054                     (pagedep->pd_state & NEWBLOCK))
5055                         flushparent = 1;
5056                 else
5057                         flushparent = 0;
5058                 /*
5059                  * If we are being fsync'ed as part of vgone'ing this vnode,
5060                  * then we will not be able to release and recover the
5061                  * vnode below, so we just have to give up on writing its
5062                  * directory entry out. It will eventually be written, just
5063                  * not now, but then the user was not asking to have it
5064                  * written, so we are not breaking any promises.
5065                  */
5066                 if (vp->v_iflag & VI_DOOMED)
5067                         break;
5068                 /*
5069                  * We prevent deadlock by always fetching inodes from the
5070                  * root, moving down the directory tree. Thus, when fetching
5071                  * our parent directory, we first try to get the lock. If
5072                  * that fails, we must unlock ourselves before requesting
5073                  * the lock on our parent. See the comment in ufs_lookup
5074                  * for details on possible races.
5075                  */
5076                 FREE_LOCK(&lk);
5077                 if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
5078                         VOP_UNLOCK(vp, 0);
5079                         error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp);
5080                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5081                         if (error != 0)
5082                                 return (error);
5083                 }
5084                 /*
5085                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
5086                  * that are contained in direct blocks will be resolved by
5087                  * doing a ffs_update. Pagedeps contained in indirect blocks
5088                  * may require a complete sync'ing of the directory. So, we
5089                  * try the cheap and fast ffs_update first, and if that fails,
5090                  * then we do the slower ffs_syncvnode of the directory.
5091                  */
5092                 if (flushparent) {
5093                         int locked;
5094
5095                         if ((error = ffs_update(pvp, 1)) != 0) {
5096                                 vput(pvp);
5097                                 return (error);
5098                         }
5099                         ACQUIRE_LOCK(&lk);
5100                         locked = 1;
5101                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
5102                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
5103                                         if (wk->wk_type != D_DIRADD)
5104                                                 panic("softdep_fsync: Unexpected type %s",
5105                                                       TYPENAME(wk->wk_type));
5106                                         dap = WK_DIRADD(wk);
5107                                         if (dap->da_state & DIRCHG)
5108                                                 pagedep = dap->da_previous->dm_pagedep;
5109                                         else
5110                                                 pagedep = dap->da_pagedep;
5111                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
5112                                         FREE_LOCK(&lk);
5113                                         locked = 0;
5114                                         if (pagedep_new_block &&
5115                                             (error = ffs_syncvnode(pvp, MNT_WAIT))) {
5116                                                 vput(pvp);
5117                                                 return (error);
5118                                         }
5119                                 }
5120                         }
5121                         if (locked)
5122                                 FREE_LOCK(&lk);
5123                 }
5124                 /*
5125                  * Flush directory page containing the inode's name.
5126                  */
5127                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
5128                     &bp);
5129                 if (error == 0)
5130                         error = bwrite(bp);
5131                 else
5132                         brelse(bp);
5133                 vput(pvp);
5134                 if (error != 0)
5135                         return (error);
5136                 ACQUIRE_LOCK(&lk);
5137                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
5138                         break;
5139         }
5140         FREE_LOCK(&lk);
5141         return (0);
5142 }
5143
5144 /*
5145  * Flush all the dirty bitmaps associated with the block device
5146  * before flushing the rest of the dirty blocks so as to reduce
5147  * the number of dependencies that will have to be rolled back.
      *
      * vp must be a disk vnode; anything else panics. Every dirty buffer
      * whose first dependency is a D_BMSAFEMAP, and which has no background
      * write in progress, is started with bawrite(). Both the bufobj lock
      * and the softdep lock (lk) are dropped around each write, so the scan
      * of the dirty list restarts from its head afterwards. drain_output()
      * is called on vp before returning.
5148  */
5149 void
5150 softdep_fsync_mountdev(vp)
5151         struct vnode *vp;
5152 {
5153         struct buf *bp, *nbp;
5154         struct worklist *wk;
5155         struct bufobj *bo;
5156
5157         if (!vn_isdisk(vp, NULL))
5158                 panic("softdep_fsync_mountdev: vnode not a disk");
5159         bo = &vp->v_bufobj;
5160 restart:
             /* Both locks are (re)taken on every pass over the dirty list. */
5161         BO_LOCK(bo);
5162         ACQUIRE_LOCK(&lk);
5163         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
5164                 /*
5165                  * If it is already scheduled, skip to the next buffer.
5166                  */
5167                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
5168                         continue;
5169
5170                 if ((bp->b_flags & B_DELWRI) == 0)
5171                         panic("softdep_fsync_mountdev: not dirty");
5172                 /*
5173                  * We are only interested in bitmaps with outstanding
5174                  * dependencies.
5175                  */
5176                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
5177                     wk->wk_type != D_BMSAFEMAP ||
5178                     (bp->b_vflags & BV_BKGRDINPROG)) {
5179                         BUF_UNLOCK(bp);
5180                         continue;
5181                 }
                     /*
                      * Release both locks before starting the write, then
                      * rescan from the head of the dirty list.
                      */
5182                 FREE_LOCK(&lk);
5183                 BO_UNLOCK(bo);
5184                 bremfree(bp);
5185                 (void) bawrite(bp);
5186                 goto restart;
5187         }
5188         FREE_LOCK(&lk);
             /* drain_output() is called with the bufobj lock still held. */
5189         drain_output(vp);
5190         BO_UNLOCK(bo);
5191 }
5192
5193 /*
5194  * This routine is called when we are trying to synchronously flush a
5195  * file. This routine must eliminate any filesystem metadata dependencies
5196  * so that the syncing routine can succeed by pushing the dirty blocks
5197  * associated with the file. If any I/O errors occur, they are returned.
      *
      * Returns 0 if vp is not using soft updates, if no dirty buffer can be
      * obtained at the top of a pass, or once the second (MNT_WAIT) pass has
      * finished. Otherwise returns the first error reported by
      * flush_inodedep_deps(), flush_pagedep_deps() or a synchronous bwrite().
      * On the error path the buffer currently being examined is handed to
      * bawrite() before returning.
5198  */
5199 int
5200 softdep_sync_metadata(struct vnode *vp)
5201 {
5202         struct pagedep *pagedep;
5203         struct allocdirect *adp;
5204         struct allocindir *aip;
5205         struct buf *bp, *nbp;
5206         struct worklist *wk;
5207         struct bufobj *bo;
5208         int i, error, waitfor;
5209
5210         if (!DOINGSOFTDEP(vp))
5211                 return (0);
5212         /*
5213          * Ensure that any direct block dependencies have been cleared.
5214          */
5215         ACQUIRE_LOCK(&lk);
5216         if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
5217                 FREE_LOCK(&lk);
5218                 return (error);
5219         }
5220         FREE_LOCK(&lk);
5221         /*
5222          * For most files, the only metadata dependencies are the
5223          * cylinder group maps that allocate their inode or blocks.
5224          * The block allocation dependencies can be found by traversing
5225          * the dependency lists for any buffers that remain on their
5226          * dirty buffer list. The inode allocation dependency will
5227          * be resolved when the inode is updated with MNT_WAIT.
5228          * This work is done in two passes. The first pass grabs most
5229          * of the buffers and begins asynchronously writing them. The
5230          * only way to wait for these asynchronous writes is to sleep
5231          * on the filesystem vnode which may stay busy for a long time
5232          * if the filesystem is active. So, instead, we make a second
5233          * pass over the dependencies blocking on each write. In the
5234          * usual case we will be blocking against a write that we
5235          * initiated, so when it is done the dependency will have been
5236          * resolved. Thus the second pass is expected to end quickly.
5237          */
5238         waitfor = MNT_NOWAIT;
5239         bo = &vp->v_bufobj;
5240
5241 top:
5242         /*
5243          * We must wait for any I/O in progress to finish so that
5244          * all potential buffers on the dirty list will be visible.
5245          */
5246         BO_LOCK(bo);
5247         drain_output(vp);
             /*
              * Keep trying the head of the dirty list until getdirtybuf()
              * hands back a buffer or the list is empty.
              */
5248         while ((bp = TAILQ_FIRST(&bo->bo_dirty.bv_hd)) != NULL) {
5249                 bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT);
5250                 if (bp)
5251                         break;
5252         }
5253         BO_UNLOCK(bo);
5254         if (bp == NULL)
5255                 return (0);
5256 loop:
5257         /* While syncing snapshots, we must allow recursive lookups */
5258         BUF_AREC(bp);
5259         ACQUIRE_LOCK(&lk);
5260         /*
5261          * As we hold the buffer locked, none of its dependencies
5262          * will disappear.
5263          */
             /*
              * Inside the switch, "continue" advances to the next dependency
              * with lk held; "break" and "goto loop_end" are the error exits
              * and are taken with lk released.
              */
5264         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5265                 switch (wk->wk_type) {
5266
5267                 case D_ALLOCDIRECT:
5268                         adp = WK_ALLOCDIRECT(wk);
5269                         if (adp->ad_state & DEPCOMPLETE)
5270                                 continue;
5271                         nbp = adp->ad_buf;
5272                         nbp = getdirtybuf(nbp, &lk, waitfor);
5273                         if (nbp == NULL)
5274                                 continue;
5275                         FREE_LOCK(&lk);
                             /*
                              * First pass: start the write and move on.
                              * Second pass: write synchronously and leave
                              * for loop_end if the write fails.
                              */
5276                         if (waitfor == MNT_NOWAIT) {
5277                                 bawrite(nbp);
5278                         } else if ((error = bwrite(nbp)) != 0) {
5279                                 break;
5280                         }
5281                         ACQUIRE_LOCK(&lk);
5282                         continue;
5283
5284                 case D_ALLOCINDIR:
5285                         aip = WK_ALLOCINDIR(wk);
5286                         if (aip->ai_state & DEPCOMPLETE)
5287                                 continue;
5288                         nbp = aip->ai_buf;
5289                         nbp = getdirtybuf(nbp, &lk, waitfor);
5290                         if (nbp == NULL)
5291                                 continue;
5292                         FREE_LOCK(&lk);
5293                         if (waitfor == MNT_NOWAIT) {
5294                                 bawrite(nbp);
5295                         } else if ((error = bwrite(nbp)) != 0) {
5296                                 break;
5297                         }
5298                         ACQUIRE_LOCK(&lk);
5299                         continue;
5300
5301                 case D_INDIRDEP:
5302                 restart:
5303
                             /*
                              * These writes are always synchronous, whatever
                              * the current pass. lk is dropped around each
                              * bwrite(), so the list is rescanned from its
                              * head after every write and whenever
                              * getdirtybuf() returns no buffer.
                              */
5304                         LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
5305                                 if (aip->ai_state & DEPCOMPLETE)
5306                                         continue;
5307                                 nbp = aip->ai_buf;
5308                                 nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
5309                                 if (nbp == NULL)
5310                                         goto restart;
5311                                 FREE_LOCK(&lk);
5312                                 if ((error = bwrite(nbp)) != 0) {
5313                                         goto loop_end;
5314                                 }
5315                                 ACQUIRE_LOCK(&lk);
5316                                 goto restart;
5317                         }
5318                         continue;
5319
5320                 case D_INODEDEP:
                             /*
                              * flush_inodedep_deps() returns with lk held
                              * even on error, so release it before leaving
                              * for loop_end.
                              */
5321                         if ((error = flush_inodedep_deps(wk->wk_mp,
5322                             WK_INODEDEP(wk)->id_ino)) != 0) {
5323                                 FREE_LOCK(&lk);
5324                                 break;
5325                         }
5326                         continue;
5327
5328                 case D_PAGEDEP:
5329                         /*
5330                          * We are trying to sync a directory that may
5331                          * have dependencies on both its own metadata
5332                          * and/or dependencies on the inodes of any
5333                          * recently allocated files. We walk its diradd
5334                          * lists pushing out the associated inode.
5335                          */
5336                         pagedep = WK_PAGEDEP(wk);
5337                         for (i = 0; i < DAHASHSZ; i++) {
5338                                 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
5339                                         continue;
                                     /*
                                      * flush_pagedep_deps() retakes lk before
                                      * returning an error; drop it here.
                                      */
5340                                 if ((error =
5341                                     flush_pagedep_deps(vp, wk->wk_mp,
5342                                                 &pagedep->pd_diraddhd[i]))) {
5343                                         FREE_LOCK(&lk);
5344                                         goto loop_end;
5345                                 }
5346                         }
5347                         continue;
5348
5349                 case D_MKDIR:
5350                         /*
5351                          * This case should never happen if the vnode has
5352                          * been properly sync'ed. However, if this function
5353                          * is used at a place where the vnode has not yet
5354                          * been sync'ed, this dependency can show up. So,
5355                          * rather than panic, just flush it.
5356                          */
5357                         nbp = WK_MKDIR(wk)->md_buf;
5358                         nbp = getdirtybuf(nbp, &lk, waitfor);
5359                         if (nbp == NULL)
5360                                 continue;
5361                         FREE_LOCK(&lk);
5362                         if (waitfor == MNT_NOWAIT) {
5363                                 bawrite(nbp);
5364                         } else if ((error = bwrite(nbp)) != 0) {
5365                                 break;
5366                         }
5367                         ACQUIRE_LOCK(&lk);
5368                         continue;
5369
5370                 case D_BMSAFEMAP:
5371                         /*
5372                          * This case should never happen if the vnode has
5373                          * been properly sync'ed. However, if this function
5374                          * is used at a place where the vnode has not yet
5375                          * been sync'ed, this dependency can show up. So,
5376                          * rather than panic, just flush it.
5377                          */
5378                         nbp = WK_BMSAFEMAP(wk)->sm_buf;
5379                         nbp = getdirtybuf(nbp, &lk, waitfor);
5380                         if (nbp == NULL)
5381                                 continue;
5382                         FREE_LOCK(&lk);
5383                         if (waitfor == MNT_NOWAIT) {
5384                                 bawrite(nbp);
5385                         } else if ((error = bwrite(nbp)) != 0) {
5386                                 break;
5387                         }
5388                         ACQUIRE_LOCK(&lk);
5389                         continue;
5390
5391                 default:
5392                         panic("softdep_sync_metadata: Unknown type %s",
5393                             TYPENAME(wk->wk_type));
5394                         /* NOTREACHED */
5395                 }
5396         loop_end:
5397                 /* We reach here only in error and unlocked */
5398                 if (error == 0)
5399                         panic("softdep_sync_metadata: zero error");
5400                 BUF_NOREC(bp);
5401                 bawrite(bp);
5402                 return (error);
5403         }
5404         FREE_LOCK(&lk);
             /*
              * Pick up the next dirty buffer that getdirtybuf() will give us
              * before the current one is handed to bawrite().
              */
5405         BO_LOCK(bo);
5406         while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
5407                 nbp = getdirtybuf(nbp, BO_MTX(bo), MNT_WAIT);
5408                 if (nbp)
5409                         break;
5410         }
5411         BO_UNLOCK(bo);
5412         BUF_NOREC(bp);
5413         bawrite(bp);
5414         if (nbp != NULL) {
5415                 bp = nbp;
5416                 goto loop;
5417         }
5418         /*
5419          * The brief unlock is to allow any pent up dependency
5420          * processing to be done. Then proceed with the second pass.
5421          */
5422         if (waitfor == MNT_NOWAIT) {
5423                 waitfor = MNT_WAIT;
5424                 goto top;
5425         }
5426
5427         /*
5428          * If we have managed to get rid of all the dirty buffers,
5429          * then we are done. For certain directories and block
5430          * devices, we may need to do further work.
5431          *
5432          * We must wait for any I/O in progress to finish so that
5433          * all potential buffers on the dirty list will be visible.
5434          */
5435         BO_LOCK(bo);
5436         drain_output(vp);
5437         BO_UNLOCK(bo);
5438         return (0);
5439 }
5440
5441 /*
5442  * Flush the dependencies associated with an inodedep.
5443  * Called with the softdep lock (lk) held. The lock is released and
      * retaken while this routine runs and is held again on every return.
      *
      * mp and ino identify the inodedep to look up. Returns 0 when no
      * inodedep exists for the inode or when both passes have completed,
      * otherwise the error stored by flush_deplist() from a failed
      * synchronous write.
5444  */
5445 static int
5446 flush_inodedep_deps(mp, ino)
5447         struct mount *mp;
5448         ino_t ino;
5449 {
5450         struct inodedep *inodedep;
5451         int error, waitfor;
5452
5453         /*
5454          * This work is done in two passes. The first pass grabs most
5455          * of the buffers and begins asynchronously writing them. The
5456          * only way to wait for these asynchronous writes is to sleep
5457          * on the filesystem vnode which may stay busy for a long time
5458          * if the filesystem is active. So, instead, we make a second
5459          * pass over the dependencies blocking on each write. In the
5460          * usual case we will be blocking against a write that we
5461          * initiated, so when it is done the dependency will have been
5462          * resolved. Thus the second pass is expected to end quickly.
5463          * We give a brief window at the top of the loop to allow
5464          * any pending I/O to complete.
5465          */
5466         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
5467                 if (error)
5468                         return (error);
                     /* The "brief window": drop lk and take it straight back. */
5469                 FREE_LOCK(&lk);
5470                 ACQUIRE_LOCK(&lk);
                     /* The inodedep is looked up afresh on every iteration. */
5471                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
5472                         return (0);
                     /*
                      * A non-zero return from flush_deplist() means start
                      * over: the four lists are walked again from the first,
                      * after the error check at the top of the loop.
                      */
5473                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
5474                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
5475                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
5476                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
5477                         continue;
5478                 /*
5479                  * If pass2, we are done, otherwise do pass 2.
5480                  */
5481                 if (waitfor == MNT_WAIT)
5482                         break;
5483                 waitfor = MNT_WAIT;
5484         }
5485         /*
5486          * Try freeing inodedep in case all dependencies have been removed.
5487          */
5488         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
5489                 (void) free_inodedep(inodedep);
5490         return (0);
5491 }
5492
5493 /*
5494  * Flush an inode dependency list.
5495  * Called with the softdep lock (lk) held (asserted below); the lock is
      * dropped around each write and is held again on return.
      *
      * At most one buffer is written per call. Returns 1 when a write was
      * issued, or when getdirtybuf() returned no buffer during a MNT_WAIT
      * pass; the caller is expected to rescan. Returns 0 when the list was
      * walked to the end without writing anything. *errorp is assigned only
      * by a synchronous bwrite() (MNT_WAIT pass) and is left untouched
      * otherwise.
5496  */
5497 static int
5498 flush_deplist(listhead, waitfor, errorp)
5499         struct allocdirectlst *listhead;
5500         int waitfor;
5501         int *errorp;
5502 {
5503         struct allocdirect *adp;
5504         struct buf *bp;
5505
5506         mtx_assert(&lk, MA_OWNED);
5507         TAILQ_FOREACH(adp, listhead, ad_next) {
5508                 if (adp->ad_state & DEPCOMPLETE)
5509                         continue;
5510                 bp = adp->ad_buf;
5511                 bp = getdirtybuf(bp, &lk, waitfor);
5512                 if (bp == NULL) {
                             /* No buffer: skip it on pass 1, rescan on pass 2. */
5513                         if (waitfor == MNT_NOWAIT)
5514                                 continue;
5515                         return (1);
5516                 }
5517                 FREE_LOCK(&lk);
5518                 if (waitfor == MNT_NOWAIT) {
5519                         bawrite(bp);
5520                 } else if ((*errorp = bwrite(bp)) != 0) {
                             /* Retake lk so the caller sees it held on error. */
5521                         ACQUIRE_LOCK(&lk);
5522                         return (1);
5523                 }
5524                 ACQUIRE_LOCK(&lk);
5525                 return (1);
5526         }
5527         return (0);
5528 }
5529
5530 /*
5531  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5532  * Called with the softdep lock (lk) held. The lock is released around
      * each blocking operation and is held again on return, including on
      * the error paths.
      *
      * pvp is the vnode passed to ffs_update() for MKDIR_PARENT entries,
      * mp the mount used for vnode and inodedep lookups, and diraddhdp the
      * diradd list to empty. Returns 0 once the list is empty, otherwise
      * the error from the operation that failed.
5533  */
5534 static int
5535 flush_pagedep_deps(pvp, mp, diraddhdp)
5536         struct vnode *pvp;
5537         struct mount *mp;
5538         struct diraddhd *diraddhdp;
5539 {
5540         struct inodedep *inodedep;
5541         struct ufsmount *ump;
5542         struct diradd *dap;
5543         struct vnode *vp;
5544         struct bufobj *bo;
5545         int error = 0;
5546         struct buf *bp;
5547         ino_t inum;
5548         struct worklist *wk;
5549
5550         ump = VFSTOUFS(mp);
             /*
              * Each iteration works on the first entry of the list. Every
              * "break" below is taken with lk released; the lock is retaken
              * after the loop when error is set.
              */
5551         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5552                 /*
5553                  * Flush ourselves if this directory entry
5554                  * has a MKDIR_PARENT dependency.
5555                  */
5556                 if (dap->da_state & MKDIR_PARENT) {
5557                         FREE_LOCK(&lk);
5558                         if ((error = ffs_update(pvp, 1)) != 0)
5559                                 break;
5560                         ACQUIRE_LOCK(&lk);
5561                         /*
5562                          * If that cleared dependencies, go on to next.
5563                          */
5564                         if (dap != LIST_FIRST(diraddhdp))
5565                                 continue;
5566                         if (dap->da_state & MKDIR_PARENT)
5567                                 panic("flush_pagedep_deps: MKDIR_PARENT");
5568                 }
5569                 /*
5570                  * A newly allocated directory must have its "." and
5571                  * ".." entries written out before its name can be
5572                  * committed in its parent. We do not want or need
5573                  * the full semantics of a synchronous ffs_syncvnode as
5574                  * that may end up here again, once for each directory
5575                  * level in the filesystem. Instead, we push the blocks
5576                  * and wait for them to clear. We have to fsync twice
5577                  * because the first call may choose to defer blocks
5578                  * that still have dependencies, but deferral will
5579                  * happen at most once.
5580                  */
5581                 inum = dap->da_newinum;
5582                 if (dap->da_state & MKDIR_BODY) {
5583                         FREE_LOCK(&lk);
5584                         if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
5585                                 break;
5586                         if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
5587                             (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
5588                                 vput(vp);
5589                                 break;
5590                         }
5591                         bo = &vp->v_bufobj;
5592                         BO_LOCK(bo);
5593                         drain_output(vp);
5594                         /*
5595                          * If first block is still dirty with a D_MKDIR
5596                          * dependency then it needs to be written now.
5597                          */
                             /*
                              * The loop only repeats when BUF_LOCK reports
                              * ENOLCK; every other path leaves it with the
                              * bufobj lock held and error set to the result.
                              */
5598                         for (;;) {
5599                                 error = 0;
5600                                 bp = gbincore(bo, 0);
5601                                 if (bp == NULL)
5602                                         break;  /* First block not present */
                                     /*
                                      * The bufobj mutex is passed as the
                                      * interlock and is retaken right after
                                      * the call.
                                      */
5603                                 error = BUF_LOCK(bp,
5604                                                  LK_EXCLUSIVE |
5605                                                  LK_SLEEPFAIL |
5606                                                  LK_INTERLOCK,
5607                                                  BO_MTX(bo));
5608                                 BO_LOCK(bo);
5609                                 if (error == ENOLCK)
5610                                         continue;       /* Slept, retry */
5611                                 if (error != 0)
5612                                         break;          /* Failed */
5613                                 if ((bp->b_flags & B_DELWRI) == 0) {
5614                                         BUF_UNLOCK(bp);
5615                                         break;  /* Buffer not dirty */
5616                                 }
                                     /* Look for a D_MKDIR among the dependencies. */
5617                                 for (wk = LIST_FIRST(&bp->b_dep);
5618                                      wk != NULL;
5619                                      wk = LIST_NEXT(wk, wk_list))
5620                                         if (wk->wk_type == D_MKDIR)
5621                                                 break;
5622                                 if (wk == NULL)
5623                                         BUF_UNLOCK(bp); /* Dependency gone */
5624                                 else {
5625                                         /*
5626                                          * D_MKDIR dependency remains,
5627                                          * must write buffer to stable
5628                                          * storage.
5629                                          */
5630                                         BO_UNLOCK(bo);
5631                                         bremfree(bp);
5632                                         error = bwrite(bp);
5633                                         BO_LOCK(bo);
5634                                 }
5635                                 break;
5636                         }
5637                         BO_UNLOCK(bo);
5638                         vput(vp);
5639                         if (error != 0)
5640                                 break;  /* Flushing of first block failed */
5641                         ACQUIRE_LOCK(&lk);
5642                         /*
5643                          * If that cleared dependencies, go on to next.
5644                          */
5645                         if (dap != LIST_FIRST(diraddhdp))
5646                                 continue;
5647                         if (dap->da_state & MKDIR_BODY)
5648                                 panic("flush_pagedep_deps: MKDIR_BODY");
5649                 }
5650                 /*
5651                  * Flush the inode on which the directory entry depends.
5652                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5653                  * the only remaining dependency is that the updated inode
5654                  * count must get pushed to disk. The inode has already
5655                  * been pushed into its inode buffer (via VOP_UPDATE) at
5656                  * the time of the reference count change. So we need only
5657                  * locate that buffer, ensure that there will be no rollback
5658                  * caused by a bitmap dependency, then write the inode buffer.
5659                  */
5660 retry:
5661                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
5662                         panic("flush_pagedep_deps: lost inode");
5663                 /*
5664                  * If the inode still has bitmap dependencies,
5665                  * push them to disk.
5666                  */
5667                 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5668                         bp = inodedep->id_buf;
5669                         bp = getdirtybuf(bp, &lk, MNT_WAIT);
                             /* No buffer returned: look the inodedep up again. */
5670                         if (bp == NULL)
5671                                 goto retry;
5672                         FREE_LOCK(&lk);
5673                         if ((error = bwrite(bp)) != 0)
5674                                 break;
5675                         ACQUIRE_LOCK(&lk);
5676                         if (dap != LIST_FIRST(diraddhdp))
5677                                 continue;
5678                 }
5679                 /*
5680                  * If the inode is still sitting in a buffer waiting
5681                  * to be written, push it to disk.
5682                  */
5683                 FREE_LOCK(&lk);
                     /* Read the filesystem block that holds inode inum. */
5684                 if ((error = bread(ump->um_devvp,
5685                     fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5686                     (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5687                         brelse(bp);
5688                         break;
5689                 }
5690                 if ((error = bwrite(bp)) != 0)
5691                         break;
5692                 ACQUIRE_LOCK(&lk);
5693                 /*
5694                  * If we have failed to get rid of all the dependencies
5695                  * then something is seriously wrong.
5696                  */
5697                 if (dap == LIST_FIRST(diraddhdp))
5698                         panic("flush_pagedep_deps: flush failed");
5699         }
             /* Error exits arrive here with lk released; retake it. */
5700         if (error)
5701                 ACQUIRE_LOCK(&lk);
5702         return (error);
5703 }
5704
5705 /*
5706  * A large burst of file addition or deletion activity can drive the
5707  * memory load excessively high. First attempt to slow things down
5708  * using the techniques below. If that fails, this routine requests
5709  * the offending operations to fall back to running synchronously
5710  * until the memory load returns to a reasonable level.
      *
      * Returns 0 while num_dirrem, num_inodedep and the mount's indirdep
      * count are all below their limits, and 1 otherwise. When 1 is
      * returned stat_sync_limit_hit is incremented, and softdep_speedup()
      * is called if the indirdep count is what reached its limit. The
      * softdep lock (lk) is taken and released inside this routine.
5711  */
5712 int
5713 softdep_slowdown(vp)
5714         struct vnode *vp;
5715 {
5716         int max_softdeps_hard;
5717
5718         ACQUIRE_LOCK(&lk);
             /* The hard limit is max_softdeps plus ten percent. */
5719         max_softdeps_hard = max_softdeps * 11 / 10;
             /* dirrem entries are held to half of the hard limit. */
5720         if (num_dirrem < max_softdeps_hard / 2 &&
5721             num_inodedep < max_softdeps_hard &&
5722             VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps) {
5723                 FREE_LOCK(&lk);
5724                 return (0);
5725         }
5726         if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
5727                 softdep_speedup();
5728         stat_sync_limit_hit += 1;
5729         FREE_LOCK(&lk);
5730         return (1);
5731 }
5732
5733 /*
5734  * Called by the allocation routines when they are about to fail
5735  * in the hope that we can free up some disk space.
5736  *
5737  * First check to see if the work list has anything on it. If it has,
5738  * clean up entries until we successfully free some space. Because this
5739  * process holds inodes locked, we cannot handle any remove requests
5740  * that might block on a locked inode as that could lead to deadlock.
5741  * If the worklist yields no free space, encourage the syncer daemon
5742  * to help us. In no event will we try for longer than tickdelay seconds.
      *
      * Called with the UFS mutex of vp's mount held (asserted below); the
      * mutex is dropped and retaken while working and is held on return.
      * Returns 0 if ffs_update() fails or the time limit passes, and 1 when
      * the loop ends because no pending blocks remain or the free block
      * count has risen above the target computed on entry.
5743  */
5744 int
5745 softdep_request_cleanup(fs, vp)
5746         struct fs *fs;
5747         struct vnode *vp;
5748 {
5749         struct ufsmount *ump;
5750         long starttime;
5751         ufs2_daddr_t needed;
5752         int error;
5753
5754         ump = VTOI(vp)->i_ump;
5755         mtx_assert(UFS_MTX(ump), MA_OWNED);
             /*
              * Target: the free block count on entry plus fs_contigsumsize.
              * Despite its name, starttime is the deadline for giving up.
              */
5756         needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
5757         starttime = time_second + tickdelay;
5758         /*
5759          * If we are being called because of a process doing a
5760          * copy-on-write, then it is not safe to update the vnode
5761          * as we may recurse into the copy-on-write routine.
5762          */
5763         if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
5764                 UFS_UNLOCK(ump);
5765                 error = ffs_update(vp, 1);
5766                 UFS_LOCK(ump);
5767                 if (error != 0)
5768                         return (0);
5769         }
5770         while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
5771                 if (time_second > starttime)
5772                         return (0);
                     /* Swap the UFS mutex for the softdep lock while working. */
5773                 UFS_UNLOCK(ump);
5774                 ACQUIRE_LOCK(&lk);
                     /*
                      * Any result other than -1 from process_worklist_item()
                      * counts as a push; go round and re-check the counters.
                      */
5775                 if (ump->softdep_on_worklist > 0 &&
5776                     process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
5777                         stat_worklist_push += 1;
5778                         FREE_LOCK(&lk);
5779                         UFS_LOCK(ump);
5780                         continue;
5781                 }
5782                 request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
5783                 FREE_LOCK(&lk);
5784                 UFS_LOCK(ump);
5785         }
5786         return (1);
5787 }
5788
5789 /*
5790  * If memory utilization has gotten too high, deliberately slow things
5791  * down and speed up the I/O processing.
5792  */
5793 extern struct thread *syncertd;
5794 static int
5795 request_cleanup(mp, resource)
5796         struct mount *mp;
5797         int resource;
5798 {
5799         struct thread *td = curthread;
5800         struct ufsmount *ump;
5801
5802         mtx_assert(&lk, MA_OWNED);
5803         /*
5804          * We never hold up the filesystem syncer or buf daemon.
5805          */
5806         if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
5807                 return (0);
5808         ump = VFSTOUFS(mp);
5809         /*
5810          * First check to see if the work list has gotten backlogged.
5811          * If it has, co-opt this process to help clean up two entries.
5812          * Because this process may hold inodes locked, we cannot
5813          * handle any remove requests that might block on a locked
5814          * inode as that could lead to deadlock.  We set TDP_SOFTDEP
5815          * to avoid recursively processing the worklist.
5816          */
5817         if (ump->softdep_on_worklist > max_softdeps / 10) {
5818                 td->td_pflags |= TDP_SOFTDEP;
5819                 process_worklist_item(mp, LK_NOWAIT);
5820                 process_worklist_item(mp, LK_NOWAIT);
5821                 td->td_pflags &= ~TDP_SOFTDEP;
5822                 stat_worklist_push += 2;
5823                 return(1);
5824         }
5825         /*
5826          * Next, we attempt to speed up the syncer process. If that
5827          * is successful, then we allow the process to continue.
5828          */
5829         if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
5830                 return(0);
5831         /*
5832          * If we are resource constrained on inode dependencies, try
5833          * flushing some dirty inodes. Otherwise, we are constrained
5834          * by file deletions, so try accelerating flushes of directories
5835          * with removal dependencies. We would like to do the cleanup
5836          * here, but we probably hold an inode locked at this point and
5837          * that might deadlock against one that we try to clean. So,
5838          * the best that we can do is request the syncer daemon to do
5839          * the cleanup for us.
5840          */
5841         switch (resource) {
5842
5843         case FLUSH_INODES:
5844                 stat_ino_limit_push += 1;
5845                 req_clear_inodedeps += 1;
5846                 stat_countp = &stat_ino_limit_hit;
5847                 break;
5848
5849         case FLUSH_REMOVE:
5850         case FLUSH_REMOVE_WAIT:
5851                 stat_blk_limit_push += 1;
5852                 req_clear_remove += 1;
5853                 stat_countp = &stat_blk_limit_hit;
5854                 break;
5855
5856         default:
5857                 panic("request_cleanup: unknown type");
5858         }
5859         /*
5860          * Hopefully the syncer daemon will catch up and awaken us.
5861          * We wait at most tickdelay before proceeding in any case.
5862          */
5863         proc_waiting += 1;
5864         if (callout_pending(&softdep_callout) == FALSE)
5865                 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
5866                     pause_timer, 0);
5867
5868         msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
5869         proc_waiting -= 1;
5870         return (1);
5871 }
5872
5873 /*
5874  * Awaken processes pausing in request_cleanup and clear proc_waiting
5875  * to indicate that there is no longer a timer running.
5876  */
5877 static void
5878 pause_timer(arg)
5879         void *arg;
5880 {
5881
5882         /*
5883          * The callout_ API has acquired mtx and will hold it around this
5884          * function call.
5885          */
5886         *stat_countp += 1;
5887         wakeup_one(&proc_waiting);
5888         if (proc_waiting > 0)
5889                 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
5890                     pause_timer, 0);
5891 }
5892
5893 /*
5894  * Flush out a directory with at least one removal dependency in an effort to
5895  * reduce the number of dirrem, freefile, and freeblks dependency structures.
5896  */
5897 static void
5898 clear_remove(td)
5899         struct thread *td;
5900 {
5901         struct pagedep_hashhead *pagedephd;
5902         struct pagedep *pagedep;
5903         static int next = 0;
5904         struct mount *mp;
5905         struct vnode *vp;
5906         struct bufobj *bo;
5907         int error, cnt;
5908         ino_t ino;
5909
5910         mtx_assert(&lk, MA_OWNED);
5911
5912         for (cnt = 0; cnt < pagedep_hash; cnt++) {
5913                 pagedephd = &pagedep_hashtbl[next++];
5914                 if (next >= pagedep_hash)
5915                         next = 0;
5916                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5917                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
5918                                 continue;
5919                         mp = pagedep->pd_list.wk_mp;
5920                         ino = pagedep->pd_ino;
5921                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5922                                 continue;
5923                         FREE_LOCK(&lk);
5924                         if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
5925                                 softdep_error("clear_remove: vget", error);
5926                                 vn_finished_write(mp);
5927                                 ACQUIRE_LOCK(&lk);
5928                                 return;
5929                         }
5930                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
5931                                 softdep_error("clear_remove: fsync", error);
5932                         bo = &vp->v_bufobj;
5933                         BO_LOCK(bo);
5934                         drain_output(vp);
5935                         BO_UNLOCK(bo);
5936                         vput(vp);
5937                         vn_finished_write(mp);
5938                         ACQUIRE_LOCK(&lk);
5939                         return;
5940                 }
5941         }
5942 }
5943
5944 /*
5945  * Clear out a block of dirty inodes in an effort to reduce
5946  * the number of inodedep dependency structures.
5947  */
5948 static void
5949 clear_inodedeps(td)
5950         struct thread *td;
5951 {
5952         struct inodedep_hashhead *inodedephd;
5953         struct inodedep *inodedep;
5954         static int next = 0;
5955         struct mount *mp;
5956         struct vnode *vp;
5957         struct fs *fs;
5958         int error, cnt;
5959         ino_t firstino, lastino, ino;
5960
5961         mtx_assert(&lk, MA_OWNED);
5962         /*
5963          * Pick a random inode dependency to be cleared.
5964          * We will then gather up all the inodes in its block
5965          * that have dependencies and flush them out.
5966          */
5967         for (cnt = 0; cnt < inodedep_hash; cnt++) {
5968                 inodedephd = &inodedep_hashtbl[next++];
5969                 if (next >= inodedep_hash)
5970                         next = 0;
5971                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5972                         break;
5973         }
5974         if (inodedep == NULL)
5975                 return;
5976         fs = inodedep->id_fs;
5977         mp = inodedep->id_list.wk_mp;
5978         /*
5979          * Find the last inode in the block with dependencies.
5980          */
5981         firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5982         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5983                 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
5984                         break;
5985         /*
5986          * Asynchronously push all but the last inode with dependencies.
5987          * Synchronously push the last inode with dependencies to ensure
5988          * that the inode block gets written to free up the inodedeps.
5989          */
5990         for (ino = firstino; ino <= lastino; ino++) {
5991                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
5992                         continue;
5993                 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5994                         continue;
5995                 FREE_LOCK(&lk);
5996                 if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
5997                         softdep_error("clear_inodedeps: vget", error);
5998                         vn_finished_write(mp);
5999                         ACQUIRE_LOCK(&lk);
6000                         return;
6001                 }
6002                 if (ino == lastino) {
6003                         if ((error = ffs_syncvnode(vp, MNT_WAIT)))
6004                                 softdep_error("clear_inodedeps: fsync1", error);
6005                 } else {
6006                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
6007                                 softdep_error("clear_inodedeps: fsync2", error);
6008                         BO_LOCK(&vp->v_bufobj);
6009                         drain_output(vp);
6010                         BO_UNLOCK(&vp->v_bufobj);
6011                 }
6012                 vput(vp);
6013                 vn_finished_write(mp);
6014                 ACQUIRE_LOCK(&lk);
6015         }
6016 }
6017
6018 /*
6019  * Function to determine if the buffer has outstanding dependencies
6020  * that will cause a roll-back if the buffer is written. If wantcount
6021  * is set, return number of dependencies, otherwise just yes or no.
6022  */
6023 static int
6024 softdep_count_dependencies(bp, wantcount)
6025         struct buf *bp;
6026         int wantcount;
6027 {
6028         struct worklist *wk;
6029         struct inodedep *inodedep;
6030         struct indirdep *indirdep;
6031         struct allocindir *aip;
6032         struct pagedep *pagedep;
6033         struct diradd *dap;
6034         int i, retval;
6035
6036         retval = 0;
6037         ACQUIRE_LOCK(&lk);
6038         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
6039                 switch (wk->wk_type) {
6040
6041                 case D_INODEDEP:
6042                         inodedep = WK_INODEDEP(wk);
6043                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
6044                                 /* bitmap allocation dependency */
6045                                 retval += 1;
6046                                 if (!wantcount)
6047                                         goto out;
6048                         }
6049                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
6050                                 /* direct block pointer dependency */
6051                                 retval += 1;
6052                                 if (!wantcount)
6053                                         goto out;
6054                         }
6055                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
6056                                 /* direct block pointer dependency */
6057                                 retval += 1;
6058                                 if (!wantcount)
6059                                         goto out;
6060                         }
6061                         continue;
6062
6063                 case D_INDIRDEP:
6064                         indirdep = WK_INDIRDEP(wk);
6065
6066                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
6067                                 /* indirect block pointer dependency */
6068                                 retval += 1;
6069                                 if (!wantcount)
6070                                         goto out;
6071                         }
6072                         continue;
6073
6074                 case D_PAGEDEP:
6075                         pagedep = WK_PAGEDEP(wk);
6076                         for (i = 0; i < DAHASHSZ; i++) {
6077
6078                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
6079                                         /* directory entry dependency */
6080                                         retval += 1;
6081                                         if (!wantcount)
6082                                                 goto out;
6083                                 }
6084                         }
6085                         continue;
6086
6087                 case D_BMSAFEMAP:
6088                 case D_ALLOCDIRECT:
6089                 case D_ALLOCINDIR:
6090                 case D_MKDIR:
6091                         /* never a dependency on these blocks */
6092                         continue;
6093
6094                 default:
6095                         panic("softdep_check_for_rollback: Unexpected type %s",
6096                             TYPENAME(wk->wk_type));
6097                         /* NOTREACHED */
6098                 }
6099         }
6100 out:
6101         FREE_LOCK(&lk);
6102         return retval;
6103 }
6104
6105 /*
6106  * Acquire exclusive access to a buffer.
6107  * Must be called with a locked mtx parameter.
6108  * Return acquired buffer or NULL on failure.
6109  */
6110 static struct buf *
6111 getdirtybuf(bp, mtx, waitfor)
6112         struct buf *bp;
6113         struct mtx *mtx;
6114         int waitfor;
6115 {
6116         int error;
6117
6118         mtx_assert(mtx, MA_OWNED);
6119         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
6120                 if (waitfor != MNT_WAIT)
6121                         return (NULL);
6122                 error = BUF_LOCK(bp,
6123                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
6124                 /*
6125                  * Even if we sucessfully acquire bp here, we have dropped
6126                  * mtx, which may violates our guarantee.
6127                  */
6128                 if (error == 0)
6129                         BUF_UNLOCK(bp);
6130                 else if (error != ENOLCK)
6131                         panic("getdirtybuf: inconsistent lock: %d", error);
6132                 mtx_lock(mtx);
6133                 return (NULL);
6134         }
6135         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
6136                 if (mtx == &lk && waitfor == MNT_WAIT) {
6137                         mtx_unlock(mtx);
6138                         BO_LOCK(bp->b_bufobj);
6139                         BUF_UNLOCK(bp);
6140                         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
6141                                 bp->b_vflags |= BV_BKGRDWAIT;
6142                                 msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
6143                                        PRIBIO | PDROP, "getbuf", 0);
6144                         } else
6145                                 BO_UNLOCK(bp->b_bufobj);
6146                         mtx_lock(mtx);
6147                         return (NULL);
6148                 }
6149                 BUF_UNLOCK(bp);
6150                 if (waitfor != MNT_WAIT)
6151                         return (NULL);
6152                 /*
6153                  * The mtx argument must be bp->b_vp's mutex in
6154                  * this case.
6155                  */
6156 #ifdef  DEBUG_VFS_LOCKS
6157                 if (bp->b_vp->v_type != VCHR)
6158                         ASSERT_BO_LOCKED(bp->b_bufobj);
6159 #endif
6160                 bp->b_vflags |= BV_BKGRDWAIT;
6161                 msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
6162                 return (NULL);
6163         }
6164         if ((bp->b_flags & B_DELWRI) == 0) {
6165                 BUF_UNLOCK(bp);
6166                 return (NULL);
6167         }
6168         bremfree(bp);
6169         return (bp);
6170 }
6171
6172
6173 /*
6174  * Check if it is safe to suspend the file system now.  On entry,
6175  * the vnode interlock for devvp should be held.  Return 0 with
6176  * the mount interlock held if the file system can be suspended now,
6177  * otherwise return EAGAIN with the mount interlock held.
6178  */
6179 int
6180 softdep_check_suspend(struct mount *mp,
6181                       struct vnode *devvp,
6182                       int softdep_deps,
6183                       int softdep_accdeps,
6184                       int secondary_writes,
6185                       int secondary_accwrites)
6186 {
6187         struct bufobj *bo;
6188         struct ufsmount *ump;
6189         int error;
6190
6191         ump = VFSTOUFS(mp);
6192         bo = &devvp->v_bufobj;
6193         ASSERT_BO_LOCKED(bo);
6194
6195         for (;;) {
6196                 if (!TRY_ACQUIRE_LOCK(&lk)) {
6197                         BO_UNLOCK(bo);
6198                         ACQUIRE_LOCK(&lk);
6199                         FREE_LOCK(&lk);
6200                         BO_LOCK(bo);
6201                         continue;
6202                 }
6203                 MNT_ILOCK(mp);
6204                 if (mp->mnt_secondary_writes != 0) {
6205                         FREE_LOCK(&lk);
6206                         BO_UNLOCK(bo);
6207                         msleep(&mp->mnt_secondary_writes,
6208                                MNT_MTX(mp),
6209                                (PUSER - 1) | PDROP, "secwr", 0);
6210                         BO_LOCK(bo);
6211                         continue;
6212                 }
6213                 break;
6214         }
6215
6216         /*
6217          * Reasons for needing more work before suspend:
6218          * - Dirty buffers on devvp.
6219          * - Softdep activity occurred after start of vnode sync loop
6220          * - Secondary writes occurred after start of vnode sync loop
6221          */
6222         error = 0;
6223         if (bo->bo_numoutput > 0 ||
6224             bo->bo_dirty.bv_cnt > 0 ||
6225             softdep_deps != 0 ||
6226             ump->softdep_deps != 0 ||
6227             softdep_accdeps != ump->softdep_accdeps ||
6228             secondary_writes != 0 ||
6229             mp->mnt_secondary_writes != 0 ||
6230             secondary_accwrites != mp->mnt_secondary_accwrites)
6231                 error = EAGAIN;
6232         FREE_LOCK(&lk);
6233         BO_UNLOCK(bo);
6234         return (error);
6235 }
6236
6237
6238 /*
6239  * Get the number of dependency structures for the file system, both
6240  * the current number and the total number allocated.  These will
6241  * later be used to detect that softdep processing has occurred.
6242  */
6243 void
6244 softdep_get_depcounts(struct mount *mp,
6245                       int *softdep_depsp,
6246                       int *softdep_accdepsp)
6247 {
6248         struct ufsmount *ump;
6249
6250         ump = VFSTOUFS(mp);
6251         ACQUIRE_LOCK(&lk);
6252         *softdep_depsp = ump->softdep_deps;
6253         *softdep_accdepsp = ump->softdep_accdeps;
6254         FREE_LOCK(&lk);
6255 }
6256
6257 /*
6258  * Wait for pending output on a vnode to complete.
6259  * Must be called with vnode lock and interlock locked.
6260  *
6261  * XXX: Should just be a call to bufobj_wwait().
6262  */
6263 static void
6264 drain_output(vp)
6265         struct vnode *vp;
6266 {
6267         struct bufobj *bo;
6268
6269         bo = &vp->v_bufobj;
6270         ASSERT_VOP_LOCKED(vp, "drain_output");
6271         ASSERT_BO_LOCKED(bo);
6272
6273         while (bo->bo_numoutput) {
6274                 bo->bo_flag |= BO_WWAIT;
6275                 msleep((caddr_t)&bo->bo_numoutput,
6276                     BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
6277         }
6278 }
6279
6280 /*
6281  * Called whenever a buffer that is being invalidated or reallocated
6282  * contains dependencies. This should only happen if an I/O error has
6283  * occurred. The routine is called with the buffer locked.
6284  */
6285 static void
6286 softdep_deallocate_dependencies(bp)
6287         struct buf *bp;
6288 {
6289
6290         if ((bp->b_ioflags & BIO_ERROR) == 0)
6291                 panic("softdep_deallocate_dependencies: dangling deps");
6292         softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
6293         panic("softdep_deallocate_dependencies: unrecovered I/O error");
6294 }
6295
/*
 * Report an asynchronous write error encountered by the soft updates
 * code.
 *
 * func:  label printed in front of the message (callers pass either a
 *        "routine: operation" string or a mount point name).
 * error: error number to report.
 *
 * The error is only printed; no recovery is attempted.
 */
static void
softdep_error(char *func, int error)
{

        /* XXX should do something better! */
        printf("%s: got error %d while accessing filesystem\n", func, error);
}
6308
6309 #endif /* SOFTUPDATES */