sys/gnu/fs/xfs/xfs_inode.c

   1 /*
   2  * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms of version 2 of the GNU General Public License as
   6  * published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it would be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11  *
  12  * Further, this software is distributed without any warranty that it is
  13  * free of the rightful claim of any third person regarding infringement
  14  * or the like.  Any license provided herein, whether implied or
  15  * otherwise, applies only to this software file.  Patent licenses, if
  16  * any, provided herein do not apply to combinations of this program with
  17  * other software, or any other product whatsoever.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write the Free Software Foundation, Inc., 59
  21  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
  22  *
  23  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
  24  * Mountain View, CA  94043, or:
  25  *
  26  * http://www.sgi.com
  27  *
  28  * For further information regarding this notice, see:
  29  *
  30  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  31  */
  32
  33 #include "xfs.h"
  34 #include "xfs_macros.h"
  35 #include "xfs_types.h"
  36 #include "xfs_inum.h"
  37 #include "xfs_log.h"
  38 #include "xfs_trans.h"
  39 #include "xfs_trans_priv.h"
  40 #include "xfs_sb.h"
  41 #include "xfs_ag.h"
  42 #include "xfs_dir.h"
  43 #include "xfs_dir2.h"
  44 #include "xfs_dmapi.h"
  45 #include "xfs_mount.h"
  46 #include "xfs_alloc_btree.h"
  47 #include "xfs_bmap_btree.h"
  48 #include "xfs_ialloc_btree.h"
  49 #include "xfs_btree.h"
  50 #include "xfs_imap.h"
  51 #include "xfs_alloc.h"
  52 #include "xfs_ialloc.h"
  53 #include "xfs_attr_sf.h"
  54 #include "xfs_dir_sf.h"
  55 #include "xfs_dir2_sf.h"
  56 #include "xfs_dinode.h"
  57 #include "xfs_inode_item.h"
  58 #include "xfs_inode.h"
  59 #include "xfs_bmap.h"
  60 #include "xfs_buf_item.h"
  61 #include "xfs_rw.h"
  62 #include "xfs_error.h"
  63 #include "xfs_bit.h"
  64 #include "xfs_utils.h"
  65 #include "xfs_dir2_trace.h"
  66 #include "xfs_quota.h"
  67 #include "xfs_mac.h"
  68 #include "xfs_acl.h"
  69
  70
  71 kmem_zone_t *xfs_ifork_zone;
  72 kmem_zone_t *xfs_inode_zone;
  73 kmem_zone_t *xfs_chashlist_zone;
  74
  75 /*
  76  * Used in xfs_itruncate().  This is the maximum number of extents
  77  * freed from a file in a single transaction.
  78  */
  79 #define XFS_ITRUNC_MAX_EXTENTS  2
  80
  81 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
  82 STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
  83 STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
  84 STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
  85
  86
  87 #ifdef DEBUG
  88 /*
  89  * Make sure that the extents in the given memory buffer
  90  * are valid.
  91  */
  92 STATIC void
  93 xfs_validate_extents(
  94         xfs_bmbt_rec_t          *ep,
  95         int                     nrecs,
  96         int                     disk,
  97         xfs_exntfmt_t           fmt)
  98 {
  99         xfs_bmbt_irec_t         irec;
 100         xfs_bmbt_rec_t          rec;
 101         int                     i;
 102
 103         for (i = 0; i < nrecs; i++) {
 104                 rec.l0 = get_unaligned((__uint64_t*)&ep->l0);
 105                 rec.l1 = get_unaligned((__uint64_t*)&ep->l1);
 106                 if (disk)
 107                         xfs_bmbt_disk_get_all(&rec, &irec);
 108                 else
 109                         xfs_bmbt_get_all(&rec, &irec);
 110                 if (fmt == XFS_EXTFMT_NOSTATE)
 111                         ASSERT(irec.br_state == XFS_EXT_NORM);
 112                 ep++;
 113         }
 114 }
 115 #else /* DEBUG */
 116 #define xfs_validate_extents(ep, nrecs, disk, fmt)
 117 #endif /* DEBUG */
 118
 119 /*
 120  * Check that none of the inode's in the buffer have a next
 121  * unlinked field of 0.
 122  */
 123 #if defined(DEBUG)
 124 void
 125 xfs_inobp_check(
 126         xfs_mount_t     *mp,
 127         xfs_buf_t       *bp)
 128 {
 129         int             i;
 130         int             j;
 131         xfs_dinode_t    *dip;
 132
 133         j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
 134
 135         for (i = 0; i < j; i++) {
 136                 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 137                                         i * mp->m_sb.sb_inodesize);
 138                 if (INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT))  {
 139                         xfs_fs_cmn_err(CE_ALERT, mp,
 140                                 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p.  About to pop an ASSERT.",
 141                                 bp);
 142                         ASSERT(!INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT));
 143                 }
 144         }
 145 }
 146 #endif
 147
 148 /*
 149  * This routine is called to map an inode number within a file
 150  * system to the buffer containing the on-disk version of the
 151  * inode.  It returns a pointer to the buffer containing the
 152  * on-disk inode in the bpp parameter, and in the dip parameter
 153  * it returns a pointer to the on-disk inode within that buffer.
 154  *
 155  * If a non-zero error is returned, then the contents of bpp and
 156  * dipp are undefined.
 157  *
 158  * Use xfs_imap() to determine the size and location of the
 159  * buffer to read from disk.
 160  */
 161 int
 162 xfs_inotobp(
 163         xfs_mount_t     *mp,
 164         xfs_trans_t     *tp,
 165         xfs_ino_t       ino,
 166         xfs_dinode_t    **dipp,
 167         xfs_buf_t       **bpp,
 168         int             *offset)
 169 {
 170         int             di_ok;
 171         xfs_imap_t      imap;
 172         xfs_buf_t       *bp;
 173         int             error;
 174         xfs_dinode_t    *dip;
 175
 176         /*
 177          * Call the space managment code to find the location of the
 178          * inode on disk.
 179          */
 180         imap.im_blkno = 0;
 181         error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
 182         if (error != 0) {
 183                 cmn_err(CE_WARN,
 184         "xfs_inotobp: xfs_imap()  returned an "
 185         "error %d on %s.  Returning error.", error, mp->m_fsname);
 186                 return error;
 187         }
 188
 189         /*
 190          * If the inode number maps to a block outside the bounds of the
 191          * file system then return NULL rather than calling read_buf
 192          * and panicing when we get an error from the driver.
 193          */
 194         if ((imap.im_blkno + imap.im_len) >
 195             XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
 196                 cmn_err(CE_WARN,
 197         "xfs_inotobp: inode number (%d + %d) maps to a block outside the bounds "
 198         "of the file system %s.  Returning EINVAL.",
 199                         imap.im_blkno, imap.im_len,mp->m_fsname);
 200                 return XFS_ERROR(EINVAL);
 201         }
 202
 203         /*
 204          * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
 205          * default to just a read_buf() call.
 206          */
 207         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
 208                                    (int)imap.im_len, XFS_BUF_LOCK, &bp);
 209
 210         if (error) {
 211                 cmn_err(CE_WARN,
 212         "xfs_inotobp: xfs_trans_read_buf()  returned an "
 213         "error %d on %s.  Returning error.", error, mp->m_fsname);
 214                 return error;
 215         }
 216         dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
 217         di_ok =
 218                 INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
 219                 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
 220         if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
 221                         XFS_RANDOM_ITOBP_INOTOBP))) {
 222                 XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
 223                 xfs_trans_brelse(tp, bp);
 224                 cmn_err(CE_WARN,
 225         "xfs_inotobp: XFS_TEST_ERROR()  returned an "
 226         "error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
 227                 return XFS_ERROR(EFSCORRUPTED);
 228         }
 229
 230         xfs_inobp_check(mp, bp);
 231
 232         /*
 233          * Set *dipp to point to the on-disk inode in the buffer.
 234          */
 235         *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 236         *bpp = bp;
 237         *offset = imap.im_boffset;
 238         return 0;
 239 }
 240
 241
 242 /*
 243  * This routine is called to map an inode to the buffer containing
 244  * the on-disk version of the inode.  It returns a pointer to the
 245  * buffer containing the on-disk inode in the bpp parameter, and in
 246  * the dip parameter it returns a pointer to the on-disk inode within
 247  * that buffer.
 248  *
 249  * If a non-zero error is returned, then the contents of bpp and
 250  * dipp are undefined.
 251  *
 252  * If the inode is new and has not yet been initialized, use xfs_imap()
 253  * to determine the size and location of the buffer to read from disk.
 254  * If the inode has already been mapped to its buffer and read in once,
 255  * then use the mapping information stored in the inode rather than
 256  * calling xfs_imap().  This allows us to avoid the overhead of looking
 257  * at the inode btree for small block file systems (see xfs_dilocate()).
 258  * We can tell whether the inode has been mapped in before by comparing
 259  * its disk block address to 0.  Only uninitialized inodes will have
 260  * 0 for the disk block address.
 261  */
 262 int
 263 xfs_itobp(
 264         xfs_mount_t     *mp,
 265         xfs_trans_t     *tp,
 266         xfs_inode_t     *ip,
 267         xfs_dinode_t    **dipp,
 268         xfs_buf_t       **bpp,
 269         xfs_daddr_t     bno)
 270 {
 271         xfs_buf_t       *bp;
 272         int             error;
 273         xfs_imap_t      imap;
 274 #ifdef __KERNEL__
 275         int             i;
 276         int             ni;
 277 #endif
 278
 279         if (ip->i_blkno == (xfs_daddr_t)0) {
 280                 /*
 281                  * Call the space management code to find the location of the
 282                  * inode on disk.
 283                  */
 284                 imap.im_blkno = bno;
 285                 error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
 286                 if (error != 0) {
 287                         return error;
 288                 }
 289
 290                 /*
 291                  * If the inode number maps to a block outside the bounds
 292                  * of the file system then return NULL rather than calling
 293                  * read_buf and panicing when we get an error from the
 294                  * driver.
 295                  */
 296                 if ((imap.im_blkno + imap.im_len) >
 297                     XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
 298 #ifdef DEBUG
 299                         xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
 300                                         "(imap.im_blkno (0x%llx) "
 301                                         "+ imap.im_len (0x%llx)) > "
 302                                         " XFS_FSB_TO_BB(mp, "
 303                                         "mp->m_sb.sb_dblocks) (0x%llx)",
 304                                         (unsigned long long) imap.im_blkno,
 305                                         (unsigned long long) imap.im_len,
 306                                         XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
 307 #endif /* DEBUG */
 308                         return XFS_ERROR(EINVAL);
 309                 }
 310
 311                 /*
 312                  * Fill in the fields in the inode that will be used to
 313                  * map the inode to its buffer from now on.
 314                  */
 315                 ip->i_blkno = imap.im_blkno;
 316                 ip->i_len = imap.im_len;
 317                 ip->i_boffset = imap.im_boffset;
 318         } else {
 319                 /*
 320                  * We've already mapped the inode once, so just use the
 321                  * mapping that we saved the first time.
 322                  */
 323                 imap.im_blkno = ip->i_blkno;
 324                 imap.im_len = ip->i_len;
 325                 imap.im_boffset = ip->i_boffset;
 326         }
 327         ASSERT(bno == 0 || bno == imap.im_blkno);
 328
 329         /*
 330          * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
 331          * default to just a read_buf() call.
 332          */
 333         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
 334                                    (int)imap.im_len, XFS_BUF_LOCK, &bp);
 335
 336         if (error) {
 337 #ifdef DEBUG
 338                 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
 339                                 "xfs_trans_read_buf() returned error %d, "
 340                                 "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
 341                                 error, (unsigned long long) imap.im_blkno,
 342                                 (unsigned long long) imap.im_len);
 343 #endif /* DEBUG */
 344                 return error;
 345         }
 346 #ifdef __KERNEL__
 347         /*
 348          * Validate the magic number and version of every inode in the buffer
 349          * (if DEBUG kernel) or the first inode in the buffer, otherwise.
 350          */
 351 #ifdef DEBUG
 352         ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
 353 #else
 354         ni = 1;
 355 #endif
 356         for (i = 0; i < ni; i++) {
 357                 int             di_ok;
 358                 xfs_dinode_t    *dip;
 359
 360                 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 361                                         (i << mp->m_sb.sb_inodelog));
 362                 di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
 363                             XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
 364                 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
 365                                  XFS_RANDOM_ITOBP_INOTOBP))) {
 366 #ifdef DEBUG
 367                         prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
 368                                 mp->m_ddev_targp,
 369                                 (unsigned long long)imap.im_blkno, i,
 370                                 INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
 371 #endif
 372                         XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
 373                                              mp, dip);
 374                         xfs_trans_brelse(tp, bp);
 375                         return XFS_ERROR(EFSCORRUPTED);
 376                 }
 377         }
 378 #endif  /* __KERNEL__ */
 379
 380         xfs_inobp_check(mp, bp);
 381
 382         /*
 383          * Mark the buffer as an inode buffer now that it looks good
 384          */
 385         XFS_BUF_SET_VTYPE(bp, B_FS_INO);
 386
 387         /*
 388          * Set *dipp to point to the on-disk inode in the buffer.
 389          */
 390         *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 391         *bpp = bp;
 392         return 0;
 393 }
 394
 395 /*
 396  * Move inode type and inode format specific information from the
 397  * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 398  * this means set if_rdev to the proper value.  For files, directories,
 399  * and symlinks this means to bring in the in-line data or extent
 400  * pointers.  For a file in B-tree format, only the root is immediately
 401  * brought in-core.  The rest will be in-lined in if_extents when it
 402  * is first referenced (see xfs_iread_extents()).
 403  */
 404 STATIC int
 405 xfs_iformat(
 406         xfs_inode_t             *ip,
 407         xfs_dinode_t            *dip)
 408 {
 409         xfs_attr_shortform_t    *atp;
 410         int                     size;
 411         int                     error;
 412         xfs_fsize_t             di_size;
 413         ip->i_df.if_ext_max =
 414                 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 415         error = 0;
 416
 417         if (unlikely(
 418             INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
 419                 INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
 420             INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) {
 421                 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 422                         "corrupt dinode %Lu, extent total = %d, nblocks = %Lu."
 423                         "  Unmount and run xfs_repair.",
 424                         (unsigned long long)ip->i_ino,
 425                         (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
 426                             + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
 427                         (unsigned long long)
 428                         INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT));
 429                 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 430                                      ip->i_mount, dip);
 431                 return XFS_ERROR(EFSCORRUPTED);
 432         }
 433
 434         if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) {
 435                 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 436                         "corrupt dinode %Lu, forkoff = 0x%x."
 437                         "  Unmount and run xfs_repair.",
 438                         (unsigned long long)ip->i_ino,
 439                         (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
 440                 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 441                                      ip->i_mount, dip);
 442                 return XFS_ERROR(EFSCORRUPTED);
 443         }
 444
 445         switch (ip->i_d.di_mode & S_IFMT) {
 446         case S_IFIFO:
 447         case S_IFCHR:
 448         case S_IFBLK:
 449         case S_IFSOCK:
 450                 if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) {
 451                         XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 452                                               ip->i_mount, dip);
 453                         return XFS_ERROR(EFSCORRUPTED);
 454                 }
 455                 ip->i_d.di_size = 0;
 456                 ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
 457                 break;
 458
 459         case S_IFREG:
 460         case S_IFLNK:
 461         case S_IFDIR:
 462                 switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
 463                 case XFS_DINODE_FMT_LOCAL:
 464                         /*
 465                          * no local regular files yet
 466                          */
 467                         if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) {
 468                                 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 469                                         "corrupt inode (local format for regular file) %Lu.  Unmount and run xfs_repair.",
 470                                         (unsigned long long) ip->i_ino);
 471                                 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
 472                                                      XFS_ERRLEVEL_LOW,
 473                                                      ip->i_mount, dip);
 474                                 return XFS_ERROR(EFSCORRUPTED);
 475                         }
 476
 477                         di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
 478                         if (unlikely(di_size >
 479                             XFS_DFORK_DSIZE_ARCH(dip, ip->i_mount, ARCH_CONVERT))) {
 480                                 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 481                                         "corrupt inode %Lu (bad size %Ld for local inode).  Unmount and run xfs_repair.",
 482                                         (unsigned long long) ip->i_ino,
 483                                         (long long) di_size);
 484                                 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
 485                                                      XFS_ERRLEVEL_LOW,
 486                                                      ip->i_mount, dip);
 487                                 return XFS_ERROR(EFSCORRUPTED);
 488                         }
 489
 490                         size = (int)di_size;
 491                         error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
 492                         break;
 493                 case XFS_DINODE_FMT_EXTENTS:
 494                         error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
 495                         break;
 496                 case XFS_DINODE_FMT_BTREE:
 497                         error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
 498                         break;
 499                 default:
 500                         XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
 501                                          ip->i_mount);
 502                         return XFS_ERROR(EFSCORRUPTED);
 503                 }
 504                 break;
 505
 506         default:
 507                 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
 508                 return XFS_ERROR(EFSCORRUPTED);
 509         }
 510         if (error) {
 511                 return error;
 512         }
 513         if (!XFS_DFORK_Q_ARCH(dip, ARCH_CONVERT))
 514                 return 0;
 515         ASSERT(ip->i_afp == NULL);
 516         ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
 517         ip->i_afp->if_ext_max =
 518                 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 519         switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) {
 520         case XFS_DINODE_FMT_LOCAL:
 521                 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR_ARCH(dip, ARCH_CONVERT);
 522                 size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT);
 523                 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
 524                 break;
 525         case XFS_DINODE_FMT_EXTENTS:
 526                 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
 527                 break;
 528         case XFS_DINODE_FMT_BTREE:
 529                 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
 530                 break;
 531         default:
 532                 error = XFS_ERROR(EFSCORRUPTED);
 533                 break;
 534         }
 535         if (error) {
 536                 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 537                 ip->i_afp = NULL;
 538                 xfs_idestroy_fork(ip, XFS_DATA_FORK);
 539         }
 540         return error;
 541 }
 542
 543 /*
 544  * The file is in-lined in the on-disk inode.
 545  * If it fits into if_inline_data, then copy
 546  * it there, otherwise allocate a buffer for it
 547  * and copy the data there.  Either way, set
 548  * if_data to point at the data.
 549  * If we allocate a buffer for the data, make
 550  * sure that its size is a multiple of 4 and
 551  * record the real size in i_real_bytes.
 552  */
 553 STATIC int
 554 xfs_iformat_local(
 555         xfs_inode_t     *ip,
 556         xfs_dinode_t    *dip,
 557         int             whichfork,
 558         int             size)
 559 {
 560         xfs_ifork_t     *ifp;
 561         int             real_size;
 562
 563         /*
 564          * If the size is unreasonable, then something
 565          * is wrong and we just bail out rather than crash in
 566          * kmem_alloc() or memcpy() below.
 567          */
 568         if (unlikely(size > XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT))) {
 569                 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 570                         "corrupt inode %Lu (bad size %d for local fork, size = %d).  Unmount and run xfs_repair.",
 571                         (unsigned long long) ip->i_ino, size,
 572                         XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT));
 573                 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
 574                                      ip->i_mount, dip);
 575                 return XFS_ERROR(EFSCORRUPTED);
 576         }
 577         ifp = XFS_IFORK_PTR(ip, whichfork);
 578         real_size = 0;
 579         if (size == 0)
 580                 ifp->if_u1.if_data = NULL;
 581         else if (size <= sizeof(ifp->if_u2.if_inline_data))
 582                 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 583         else {
 584                 real_size = roundup(size, 4);
 585                 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
 586         }
 587         ifp->if_bytes = size;
 588         ifp->if_real_bytes = real_size;
 589         if (size)
 590                 memcpy(ifp->if_u1.if_data,
 591                         XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT), size);
 592         ifp->if_flags &= ~XFS_IFEXTENTS;
 593         ifp->if_flags |= XFS_IFINLINE;
 594         return 0;
 595 }
 596
 597 /*
 598  * The file consists of a set of extents all
 599  * of which fit into the on-disk inode.
 600  * If there are few enough extents to fit into
 601  * the if_inline_ext, then copy them there.
 602  * Otherwise allocate a buffer for them and copy
 603  * them into it.  Either way, set if_extents
 604  * to point at the extents.
 605  */
 606 STATIC int
 607 xfs_iformat_extents(
 608         xfs_inode_t     *ip,
 609         xfs_dinode_t    *dip,
 610         int             whichfork)
 611 {
 612         xfs_bmbt_rec_t  *ep, *dp;
 613         xfs_ifork_t     *ifp;
 614         int             nex;
 615         int             real_size;
 616         int             size;
 617         int             i;
 618
 619         ifp = XFS_IFORK_PTR(ip, whichfork);
 620         nex = XFS_DFORK_NEXTENTS_ARCH(dip, whichfork, ARCH_CONVERT);
 621         size = nex * (uint)sizeof(xfs_bmbt_rec_t);
 622
 623         /*
 624          * If the number of extents is unreasonable, then something
 625          * is wrong and we just bail out rather than crash in
 626          * kmem_alloc() or memcpy() below.
 627          */
 628         if (unlikely(size < 0 || size > XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT))) {
 629                 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 630                         "corrupt inode %Lu ((a)extents = %d).  Unmount and run xfs_repair.",
 631                         (unsigned long long) ip->i_ino, nex);
 632                 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
 633                                      ip->i_mount, dip);
 634                 return XFS_ERROR(EFSCORRUPTED);
 635         }
 636
 637         real_size = 0;
 638         if (nex == 0)
 639                 ifp->if_u1.if_extents = NULL;
 640         else if (nex <= XFS_INLINE_EXTS)
 641                 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 642         else {
 643                 ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
 644                 ASSERT(ifp->if_u1.if_extents != NULL);
 645                 real_size = size;
 646         }
 647         ifp->if_bytes = size;
 648         ifp->if_real_bytes = real_size;
 649         if (size) {
 650                 dp = (xfs_bmbt_rec_t *)
 651                         XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT);
 652                 xfs_validate_extents(dp, nex, 1, XFS_EXTFMT_INODE(ip));
 653                 ep = ifp->if_u1.if_extents;
 654                 for (i = 0; i < nex; i++, ep++, dp++) {
 655                         ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0),
 656                                                                 ARCH_CONVERT);
 657                         ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1),
 658                                                                 ARCH_CONVERT);
 659                 }
 660                 xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex,
 661                         whichfork);
 662                 if (whichfork != XFS_DATA_FORK ||
 663                         XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
 664                                 if (unlikely(xfs_check_nostate_extents(
 665                                     ifp->if_u1.if_extents, nex))) {
 666                                         XFS_ERROR_REPORT("xfs_iformat_extents(2)",
 667                                                          XFS_ERRLEVEL_LOW,
 668                                                          ip->i_mount);
 669                                         return XFS_ERROR(EFSCORRUPTED);
 670                                 }
 671         }
 672         ifp->if_flags |= XFS_IFEXTENTS;
 673         return 0;
 674 }
 675
 676 /*
 677  * The file has too many extents to fit into
 678  * the inode, so they are in B-tree format.
 679  * Allocate a buffer for the root of the B-tree
 680  * and copy the root into it.  The i_extents
 681  * field will remain NULL until all of the
 682  * extents are read in (when they are needed).
 683  */
 684 STATIC int
 685 xfs_iformat_btree(
 686         xfs_inode_t             *ip,
 687         xfs_dinode_t            *dip,
 688         int                     whichfork)
 689 {
 690         xfs_bmdr_block_t        *dfp;
 691         xfs_ifork_t             *ifp;
 692         /* REFERENCED */
 693         int                     nrecs;
 694         int                     size;
 695
 696         ifp = XFS_IFORK_PTR(ip, whichfork);
 697         dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT);
 698         size = XFS_BMAP_BROOT_SPACE(dfp);
 699         nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
 700
 701         /*
 702          * blow out if -- fork has less extents than can fit in
 703          * fork (fork shouldn't be a btree format), root btree
 704          * block has more records than can fit into the fork,
 705          * or the number of extents is greater than the number of
 706          * blocks.
 707          */
 708         if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
 709             || XFS_BMDR_SPACE_CALC(nrecs) >
 710                         XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT)
 711             || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
 712                 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 713                         "corrupt inode %Lu (btree).  Unmount and run xfs_repair.",
 714                         (unsigned long long) ip->i_ino);
 715                 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
 716                                  ip->i_mount);
 717                 return XFS_ERROR(EFSCORRUPTED);
 718         }
 719
 720         ifp->if_broot_bytes = size;
 721         ifp->if_broot = kmem_alloc(size, KM_SLEEP);
 722         ASSERT(ifp->if_broot != NULL);
 723         /*
 724          * Copy and convert from the on-disk structure
 725          * to the in-memory structure.
 726          */
 727         xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT),
 728                 ifp->if_broot, size);
 729         ifp->if_flags &= ~XFS_IFEXTENTS;
 730         ifp->if_flags |= XFS_IFBROOT;
 731
 732         return 0;
 733 }
 734
 735 /*
 736  * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk
 737  * and native format
 738  *
 739  * buf  = on-disk representation
 740  * dip  = native representation
 741  * dir  = direction - +ve -> disk to native
 742  *                    -ve -> native to disk
 743  * arch = on-disk architecture
 744  */
 745 void
 746 xfs_xlate_dinode_core(
 747         xfs_caddr_t             buf,
 748         xfs_dinode_core_t       *dip,
 749         int                     dir,
 750         xfs_arch_t              arch)
 751 {
 752         xfs_dinode_core_t       *buf_core = (xfs_dinode_core_t *)buf;
 753         xfs_dinode_core_t       *mem_core = (xfs_dinode_core_t *)dip;
 754
 755         ASSERT(dir);
 756         if (arch == ARCH_NOCONVERT) {
 757                 if (dir > 0) {
 758                         memcpy((xfs_caddr_t)mem_core, (xfs_caddr_t)buf_core,
 759                                 sizeof(xfs_dinode_core_t));
 760                 } else {
 761                         memcpy((xfs_caddr_t)buf_core, (xfs_caddr_t)mem_core,
 762                                 sizeof(xfs_dinode_core_t));
 763                 }
 764                 return;
 765         }
 766
 767         INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch);
 768         INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch);
 769         INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch);
 770         INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch);
 771         INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch);
 772         INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch);
 773         INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch);
 774         INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch);
 775         INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch);
 776
 777         if (dir > 0) {
 778                 memcpy(mem_core->di_pad, buf_core->di_pad,
 779                         sizeof(buf_core->di_pad));
 780         } else {
 781                 memcpy(buf_core->di_pad, mem_core->di_pad,
 782                         sizeof(buf_core->di_pad));
 783         }
 784
 785         INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch);
 786
 787         INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec,
 788                         dir, arch);
 789         INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec,
 790                         dir, arch);
 791         INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec,
 792                         dir, arch);
 793         INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec,
 794                         dir, arch);
 795         INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec,
 796                         dir, arch);
 797         INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec,
 798                         dir, arch);
 799         INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch);
 800         INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch);
 801         INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch);
 802         INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch);
 803         INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch);
 804         INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch);
 805         INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch);
 806         INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch);
 807         INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch);
 808         INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch);
 809         INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch);
 810 }
 811
 812 /*
 813  * Given a mount structure and an inode number, return a pointer
 814  * to a newly allocated in-core inode coresponding to the given
 815  * inode number.
 816  *
 817  * Initialize the inode's attributes and extent pointers if it
 818  * already has them (it will not if the inode has no links).
 819  */
 820 int
 821 xfs_iread(
 822         xfs_mount_t     *mp,
 823         xfs_trans_t     *tp,
 824         xfs_ino_t       ino,
 825         xfs_inode_t     **ipp,
 826         xfs_daddr_t     bno)
 827 {
 828         xfs_buf_t       *bp;
 829         xfs_dinode_t    *dip;
 830         xfs_inode_t     *ip;
 831         int             error;
 832
 833         ASSERT(xfs_inode_zone != NULL);
 834
 835         ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
 836         ip->i_ino = ino;
 837         ip->i_mount = mp;
 838
 839         /*
 840          * Get pointer's to the on-disk inode and the buffer containing it.
 841          * If the inode number refers to a block outside the file system
 842          * then xfs_itobp() will return NULL.  In this case we should
 843          * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
 844          * know that this is a new incore inode.
 845          */
 846         error = xfs_itobp(mp, tp, ip, &dip, &bp, bno);
 847
 848         if (error != 0) {
 849                 kmem_zone_free(xfs_inode_zone, ip);
 850                 return error;
 851         }
 852
 853         /*
 854          * Initialize inode's trace buffers.
 855          * Do this before xfs_iformat in case it adds entries.
 856          */
 857 #ifdef XFS_BMAP_TRACE
 858         ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
 859 #endif
 860 #ifdef XFS_BMBT_TRACE
 861         ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
 862 #endif
 863 #ifdef XFS_RW_TRACE
 864         ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
 865 #endif
 866 #ifdef XFS_ILOCK_TRACE
 867         ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
 868 #endif
 869 #ifdef XFS_DIR2_TRACE
 870         ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
 871 #endif
 872
 873         /*
 874          * If we got something that isn't an inode it means someone
 875          * (nfs or dmi) has a stale handle.
 876          */
 877         if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
 878                 kmem_zone_free(xfs_inode_zone, ip);
 879                 xfs_trans_brelse(tp, bp);
 880 #ifdef DEBUG
 881                 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
 882                                 "dip->di_core.di_magic (0x%x) != "
 883                                 "XFS_DINODE_MAGIC (0x%x)",
 884                                 INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
 885                                 XFS_DINODE_MAGIC);
 886 #endif /* DEBUG */
 887                 return XFS_ERROR(EINVAL);
 888         }
 889
 890         /*
 891          * If the on-disk inode is already linked to a directory
 892          * entry, copy all of the inode into the in-core inode.
 893          * xfs_iformat() handles copying in the inode format
 894          * specific information.
 895          * Otherwise, just get the truly permanent information.
 896          */
 897         if (!INT_ISZERO(dip->di_core.di_mode, ARCH_CONVERT)) {
 898                 xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
 899                      &(ip->i_d), 1, ARCH_CONVERT);
 900                 error = xfs_iformat(ip, dip);
 901                 if (error)  {
 902                         kmem_zone_free(xfs_inode_zone, ip);
 903                         xfs_trans_brelse(tp, bp);
 904 #ifdef DEBUG
 905                         xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
 906                                         "xfs_iformat() returned error %d",
 907                                         error);
 908 #endif /* DEBUG */
 909                         return error;
 910                 }
 911         } else {
 912                 ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT);
 913                 ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT);
 914                 ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT);
 915                 ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT);
 916                 /*
 917                  * Make sure to pull in the mode here as well in
 918                  * case the inode is released without being used.
 919                  * This ensures that xfs_inactive() will see that
 920                  * the inode is already free and not try to mess
 921                  * with the uninitialized part of it.
 922                  */
 923                 ip->i_d.di_mode = 0;
 924                 /*
 925                  * Initialize the per-fork minima and maxima for a new
 926                  * inode here.  xfs_iformat will do it for old inodes.
 927                  */
 928                 ip->i_df.if_ext_max =
 929                         XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 930         }
 931
 932         /* XXXKAN: initialize i_reclaim */
 933         bzero(&ip->i_reclaim, sizeof(&ip->i_reclaim));
 934
 935         /*
 936          * The inode format changed when we moved the link count and
 937          * made it 32 bits long.  If this is an old format inode,
 938          * convert it in memory to look like a new one.  If it gets
 939          * flushed to disk we will convert back before flushing or
 940          * logging it.  We zero out the new projid field and the old link
 941          * count field.  We'll handle clearing the pad field (the remains
 942          * of the old uuid field) when we actually convert the inode to
 943          * the new format. We don't change the version number so that we
 944          * can distinguish this from a real new format inode.
 945          */
 946         if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
 947                 ip->i_d.di_nlink = ip->i_d.di_onlink;
 948                 ip->i_d.di_onlink = 0;
 949                 ip->i_d.di_projid = 0;
 950         }
 951
 952         ip->i_delayed_blks = 0;
 953
 954         /*
 955          * Mark the buffer containing the inode as something to keep
 956          * around for a while.  This helps to keep recently accessed
 957          * meta-data in-core longer.
 958          */
 959          XFS_BUF_SET_REF(bp, XFS_INO_REF);
 960
 961         /*
 962          * Use xfs_trans_brelse() to release the buffer containing the
 963          * on-disk inode, because it was acquired with xfs_trans_read_buf()
 964          * in xfs_itobp() above.  If tp is NULL, this is just a normal
 965          * brelse().  If we're within a transaction, then xfs_trans_brelse()
 966          * will only release the buffer if it is not dirty within the
 967          * transaction.  It will be OK to release the buffer in this case,
 968          * because inodes on disk are never destroyed and we will be
 969          * locking the new in-core inode before putting it in the hash
 970          * table where other processes can find it.  Thus we don't have
 971          * to worry about the inode being changed just because we released
 972          * the buffer.
 973          */
 974         xfs_trans_brelse(tp, bp);
 975         *ipp = ip;
 976         return 0;
 977 }
 978
 979 /*
 980  * Read in extents from a btree-format inode.
 981  * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 982  */
 983 int
 984 xfs_iread_extents(
 985         xfs_trans_t     *tp,
 986         xfs_inode_t     *ip,
 987         int             whichfork)
 988 {
 989         int             error;
 990         xfs_ifork_t     *ifp;
 991         size_t          size;
 992
 993         if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 994                 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
 995                                  ip->i_mount);
 996                 return XFS_ERROR(EFSCORRUPTED);
 997         }
 998         size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t);
 999         ifp = XFS_IFORK_PTR(ip, whichfork);
1000         /*
1001          * We know that the size is valid (it's checked in iformat_btree)
1002          */
1003         ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
1004         ASSERT(ifp->if_u1.if_extents != NULL);
1005         ifp->if_lastex = NULLEXTNUM;
1006         ifp->if_bytes = ifp->if_real_bytes = (int)size;
1007         ifp->if_flags |= XFS_IFEXTENTS;
1008         error = xfs_bmap_read_extents(tp, ip, whichfork);
1009         if (error) {
1010                 kmem_free(ifp->if_u1.if_extents, size);
1011                 ifp->if_u1.if_extents = NULL;
1012                 ifp->if_bytes = ifp->if_real_bytes = 0;
1013                 ifp->if_flags &= ~XFS_IFEXTENTS;
1014                 return error;
1015         }
1016         xfs_validate_extents((xfs_bmbt_rec_t *)ifp->if_u1.if_extents,
1017                 XFS_IFORK_NEXTENTS(ip, whichfork), 0, XFS_EXTFMT_INODE(ip));
1018         return 0;
1019 }
1020
1021 /*
1022  * Allocate an inode on disk and return a copy of its in-core version.
1023  * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
1024  * appropriately within the inode.  The uid and gid for the inode are
1025  * set according to the contents of the given cred structure.
1026  *
1027  * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
1028  * has a free inode available, call xfs_iget()
1029  * to obtain the in-core version of the allocated inode.  Finally,
1030  * fill in the inode and log its initial contents.  In this case,
1031  * ialloc_context would be set to NULL and call_again set to false.
1032  *
1033  * If xfs_dialloc() does not have an available inode,
1034  * it will replenish its supply by doing an allocation. Since we can
1035  * only do one allocation within a transaction without deadlocks, we
1036  * must commit the current transaction before returning the inode itself.
1037  * In this case, therefore, we will set call_again to true and return.
1038  * The caller should then commit the current transaction, start a new
1039  * transaction, and call xfs_ialloc() again to actually get the inode.
1040  *
1041  * To ensure that some other process does not grab the inode that
1042  * was allocated during the first call to xfs_ialloc(), this routine
1043  * also returns the [locked] bp pointing to the head of the freelist
1044  * as ialloc_context.  The caller should hold this buffer across
1045  * the commit and pass it back into this routine on the second call.
1046  */
1047 int
1048 xfs_ialloc(
1049         xfs_trans_t     *tp,
1050         xfs_inode_t     *pip,
1051         mode_t          mode,
1052         nlink_t         nlink,
1053         xfs_dev_t       rdev,
1054         cred_t          *cr,
1055         xfs_prid_t      prid,
1056         int             okalloc,
1057         xfs_buf_t       **ialloc_context,
1058         boolean_t       *call_again,
1059         xfs_inode_t     **ipp)
1060 {
1061         xfs_ino_t       ino;
1062         xfs_inode_t     *ip;
1063         xfs_vnode_t     *vp;
1064         uint            flags;
1065         int             error;
1066
1067         /*
1068          * Call the space management code to pick
1069          * the on-disk inode to be allocated.
1070          */
1071         ASSERT(pip != NULL);
1072         error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
1073                             ialloc_context, call_again, &ino);
1074         if (error != 0) {
1075                 return error;
1076         }
1077         if (*call_again || ino == NULLFSINO) {
1078                 *ipp = NULL;
1079                 return 0;
1080         }
1081         ASSERT(*ialloc_context == NULL);
1082
1083         /*
1084          * Get the in-core inode with the lock held exclusively.
1085          * This is because we're setting fields here we need
1086          * to prevent others from looking at until we're done.
1087          */
1088         error = xfs_trans_iget(tp->t_mountp, tp, ino, XFS_ILOCK_EXCL, &ip);
1089         if (error != 0) {
1090                 return error;
1091         }
1092         ASSERT(ip != NULL);
1093         vp = XFS_ITOV(ip);
1094         ASSERT(vp != NULL);
1095         vp->v_type = IFTOVT(mode);
1096         ip->i_d.di_mode = (__uint16_t)mode;
1097         ip->i_d.di_onlink = 0;
1098         ip->i_d.di_nlink = nlink;
1099         ASSERT(ip->i_d.di_nlink == nlink);
1100         ip->i_d.di_uid = curthread->td_ucred->cr_uid;
1101         ip->i_d.di_gid = curthread->td_ucred->cr_groups[0];
1102         ip->i_d.di_projid = prid;
1103         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1104
1105         /*
1106          * If the superblock version is up to where we support new format
1107          * inodes and this is currently an old format inode, then change
1108          * the inode version number now.  This way we only do the conversion
1109          * here rather than here and in the flush/logging code.
1110          */
1111         if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
1112             ip->i_d.di_version == XFS_DINODE_VERSION_1) {
1113                 ip->i_d.di_version = XFS_DINODE_VERSION_2;
1114                 /*
1115                  * We've already zeroed the old link count, the projid field,
1116                  * and the pad field.
1117                  */
1118         }
1119
1120         /*
1121          * Project ids won't be stored on disk if we are using a version 1 inode.
1122          */
1123         if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
1124                 xfs_bump_ino_vers2(tp, ip);
1125
1126         if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
1127                 ip->i_d.di_gid = pip->i_d.di_gid;
1128                 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
1129                         ip->i_d.di_mode |= S_ISGID;
1130                 }
1131         }
1132
1133         /*
1134          * If the group ID of the new file does not match the effective group
1135          * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
1136          * (and only if the irix_sgid_inherit compatibility variable is set).
1137          */
1138         if ((irix_sgid_inherit) &&
1139             (ip->i_d.di_mode & S_ISGID) &&
1140             (!groupmember((gid_t)ip->i_d.di_gid, curthread->td_ucred))) {
1141                 ip->i_d.di_mode &= ~S_ISGID;
1142         }
1143
1144         ip->i_d.di_size = 0;
1145         ip->i_d.di_nextents = 0;
1146         ASSERT(ip->i_d.di_nblocks == 0);
1147         xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
1148         /*
1149          * di_gen will have been taken care of in xfs_iread.
1150          */
1151         ip->i_d.di_extsize = 0;
1152         ip->i_d.di_dmevmask = 0;
1153         ip->i_d.di_dmstate = 0;
1154         ip->i_d.di_flags = 0;
1155         flags = XFS_ILOG_CORE;
1156         switch (mode & S_IFMT) {
1157         case S_IFIFO:
1158         case S_IFCHR:
1159         case S_IFBLK:
1160         case S_IFSOCK:
1161                 ip->i_d.di_format = XFS_DINODE_FMT_DEV;
1162                 ip->i_df.if_u2.if_rdev = rdev;
1163                 ip->i_df.if_flags = 0;
1164                 flags |= XFS_ILOG_DEV;
1165                 break;
1166         case S_IFREG:
1167         case S_IFDIR:
1168                 if (pip->i_d.di_flags &
1169                     (XFS_DIFLAG_NOATIME|XFS_DIFLAG_NODUMP|XFS_DIFLAG_SYNC)) {
1170                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1171                             xfs_inherit_noatime)
1172                                 ip->i_d.di_flags |= XFS_DIFLAG_NOATIME;
1173                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
1174                             xfs_inherit_nodump)
1175                                 ip->i_d.di_flags |= XFS_DIFLAG_NODUMP;
1176                         if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
1177                             xfs_inherit_sync)
1178                                 ip->i_d.di_flags |= XFS_DIFLAG_SYNC;
1179                 }
1180         case S_IFLNK:
1181                 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1182                 ip->i_df.if_flags = XFS_IFEXTENTS;
1183                 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
1184                 ip->i_df.if_u1.if_extents = NULL;
1185                 break;
1186         default:
1187                 ASSERT(0);
1188         }
1189         /*
1190          * Attribute fork settings for new inode.
1191          */
1192         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1193         ip->i_d.di_anextents = 0;
1194
1195         /*
1196          * Log the new values stuffed into the inode.
1197          */
1198         xfs_trans_log_inode(tp, ip, flags);
1199
1200         /* now that we have a v_type we can set Linux inode ops (& unlock) */
1201         XVFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
1202
1203         *ipp = ip;
1204         return 0;
1205 }
1206
/*
 * Debug-only sanity check that no blocks are allocated to the file
 * beyond the given size.  Only regular files without the realtime
 * flag are checked; everything else returns immediately.
 */
#ifdef DEBUG
void
xfs_isize_check(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip,
	xfs_fsize_t	isize)
{
	xfs_bmbt_irec_t	imaps[2];
	int		nimaps = 2;
	xfs_fileoff_t	eof_fsb;
	xfs_fileoff_t	max_fsb;
	int		bmap_error;

	if ((ip->i_d.di_mode & S_IFMT) != S_IFREG ||
	    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))
		return;

	/* Map everything from the block at isize out to the maximum offset. */
	eof_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	max_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));

	/*
	 * The filesystem could be shutting down, so bmapi may return
	 * an error.  In that case there is nothing to verify.
	 */
	bmap_error = xfs_bmapi(NULL, ip, eof_fsb, max_fsb - eof_fsb,
			       XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
			       NULL);
	if (bmap_error)
		return;

	/* The whole range must come back as one hole. */
	ASSERT(nimaps == 1);
	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
}
#endif	/* DEBUG */
1247
1248 /*
1249  * Calculate the last possible buffered byte in a file.  This must
1250  * include data that was buffered beyond the EOF by the write code.
1251  * This also needs to deal with overflowing the xfs_fsize_t type
1252  * which can happen for sizes near the limit.
1253  *
1254  * We also need to take into account any blocks beyond the EOF.  It
1255  * may be the case that they were buffered by a write which failed.
1256  * In that case the pages will still be in memory, but the inode size
1257  * will never have been updated.
1258  */
1259 xfs_fsize_t
1260 xfs_file_last_byte(
1261         xfs_inode_t     *ip)
1262 {
1263         xfs_mount_t     *mp;
1264         xfs_fsize_t     last_byte;
1265         xfs_fileoff_t   last_block;
1266         xfs_fileoff_t   size_last_block;
1267         int             error;
1268
1269         ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));
1270
1271         mp = ip->i_mount;
1272         /*
1273          * Only check for blocks beyond the EOF if the extents have
1274          * been read in.  This eliminates the need for the inode lock,
1275          * and it also saves us from looking when it really isn't
1276          * necessary.
1277          */
1278         if (ip->i_df.if_flags & XFS_IFEXTENTS) {
1279                 error = xfs_bmap_last_offset(NULL, ip, &last_block,
1280                         XFS_DATA_FORK);
1281                 if (error) {
1282                         last_block = 0;
1283                 }
1284         } else {
1285                 last_block = 0;
1286         }
1287         size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
1288         last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
1289
1290         last_byte = XFS_FSB_TO_B(mp, last_block);
1291         if (last_byte < 0) {
1292                 return XFS_MAXIOFFSET(mp);
1293         }
1294         last_byte += (1 << mp->m_writeio_log);
1295         if (last_byte < 0) {
1296                 return XFS_MAXIOFFSET(mp);
1297         }
1298         return last_byte;
1299 }
1300
#if defined(XFS_RW_TRACE)
/*
 * Record a truncate event in the inode's read/write trace buffer.
 * Each 64-bit quantity (current inode size, new_size, toss_start and
 * toss_finish) is logged as two 32-bit halves, high half first.
 * Nothing is recorded if the inode has no rwtrace buffer.
 */
STATIC void
xfs_itrunc_trace(
	int		tag,
	xfs_inode_t	*ip,
	int		flag,
	xfs_fsize_t	new_size,
	xfs_off_t	toss_start,
	xfs_off_t	toss_finish)
{
	if (ip->i_rwtrace != NULL) {
		ktrace_enter(ip->i_rwtrace,
			(void*)((long)tag),
			(void*)ip,
			(void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
			(void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
			(void*)((long)flag),
			(void*)(unsigned long)((new_size >> 32) & 0xffffffff),
			(void*)(unsigned long)(new_size & 0xffffffff),
			(void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
			(void*)(unsigned long)(toss_start & 0xffffffff),
			(void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
			(void*)(unsigned long)(toss_finish & 0xffffffff),
			(void*)(unsigned long)current_cpu(),
			(void*)0,
			(void*)0,
			(void*)0,
			(void*)0);
	}
}
#else
/* Tracing is compiled out: the call expands to nothing. */
#define	xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
#endif
1336
1337 /*
1338  * Start the truncation of the file to new_size.  The new size
1339  * must be smaller than the current size.  This routine will
1340  * clear the buffer and page caches of file data in the removed
1341  * range, and xfs_itruncate_finish() will remove the underlying
1342  * disk blocks.
1343  *
1344  * The inode must have its I/O lock locked EXCLUSIVELY, and it
1345  * must NOT have the inode lock held at all.  This is because we're
1346  * calling into the buffer/page cache code and we can't hold the
1347  * inode lock when we do so.
1348  *
1349  * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1350  * or XFS_ITRUNC_MAYBE.  The XFS_ITRUNC_MAYBE value should be used
1351  * in the case that the caller is locking things out of order and
1352  * may not be able to call xfs_itruncate_finish() with the inode lock
1353  * held without dropping the I/O lock.  If the caller must drop the
1354  * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
1355  * must be called again with all the same restrictions as the initial
1356  * call.
1357  */
1358 void
1359 xfs_itruncate_start(
1360         xfs_inode_t     *ip,
1361         uint            flags,
1362         xfs_fsize_t     new_size)
1363 {
1364         xfs_fsize_t     last_byte;
1365         xfs_off_t       toss_start;
1366         xfs_mount_t     *mp;
1367         xfs_vnode_t     *vp;
1368
1369         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1370         ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
1371         ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
1372                (flags == XFS_ITRUNC_MAYBE));
1373
1374         mp = ip->i_mount;
1375         vp = XFS_ITOV(ip);
1376         /*
1377          * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers
1378          * overlapping the region being removed.  We have to use
1379          * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the
1380          * caller may not be able to finish the truncate without
1381          * dropping the inode's I/O lock.  Make sure
1382          * to catch any pages brought in by buffers overlapping
1383          * the EOF by searching out beyond the isize by our
1384          * block size. We round new_size up to a block boundary
1385          * so that we don't toss things on the same block as
1386          * new_size but before it.
1387          *
1388          * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
1389          * call remapf() over the same region if the file is mapped.
1390          * This frees up mapped file references to the pages in the
1391          * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
1392          * that we get the latest mapped changes flushed out.
1393          */
1394         toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1395         toss_start = XFS_FSB_TO_B(mp, toss_start);
1396         if (toss_start < 0) {
1397                 /*
1398                  * The place to start tossing is beyond our maximum
1399                  * file size, so there is no way that the data extended
1400                  * out there.
1401                  */
1402                 return;
1403         }
1404         last_byte = xfs_file_last_byte(ip);
1405         xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
1406                          last_byte);
1407         if (last_byte > toss_start) {
1408                 if (flags & XFS_ITRUNC_DEFINITE) {
1409                         XVOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
1410                 } else {
1411                         XVOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
1412                 }
1413         }
1414
1415 #ifdef DEBUG
1416         if (new_size == 0) {
1417                 ASSERT(VN_CACHED(vp) == 0);
1418         }
1419 #endif
1420 }
1421
1422 /*
1423  * Shrink the file to the given new_size, which must be 0 or no
1424  * larger than the current size.  This frees the underlying blocks
1425  * in the removed range after a call to xfs_itruncate_start() or
1426  * xfs_atruncate_start().  Returns 0, or the first error reported by
1427  * xfs_bunmapi(), xfs_bmap_finish() or xfs_trans_reserve().
1428  *
1429  * The transaction passed to this routine must have made
1430  * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
1431  * This routine may commit the given transaction and
1432  * start new ones, so make sure everything involved in
1433  * the transaction is tidy before calling here.
1434  * Some transaction will be returned to the caller in *tp to be
1435  * committed.  The incoming transaction must already include
1436  * the inode, and both inode locks must be held exclusively.
1437  * The inode must also be "held" within the transaction.  On
1438  * return the inode will be "held" within the returned transaction.
1439  * This routine does NOT require any disk space to be reserved
1440  * for it within the transaction.
1441  *
1442  * The fork parameter must be either XFS_ATTR_FORK or XFS_DATA_FORK,
1443  * and it indicates the fork which is to be truncated.  For the
1444  * attribute fork we only support truncation to size 0.
1445  *
1446  * We use the sync parameter to indicate whether or not the first
1447  * transaction we perform might have to be synchronous.  For the attr fork,
1448  * it needs to be so if the unlink of the inode is not yet known to be
1449  * permanent in the log.  This keeps us from freeing and reusing the
1450  * blocks of the attribute fork before the unlink of the inode becomes
1451  * permanent.
1452  *
1453  * For the data fork, we normally have to run synchronously if we're
1454  * being called out of the inactive path or we're being called
1455  * out of the create path where we're truncating an existing file.
1456  * Either way, the truncate needs to be sync so blocks don't reappear
1457  * in the file with altered data in case of a crash.  wsync filesystems
1458  * can run the first case async because anything that shrinks the inode
1459  * has to run sync so by the time we're called here from inactive, the
1460  * inode size is permanently set to 0.
1461  *
1462  * Calls from the truncate path always need to be sync unless we're
1463  * in a wsync filesystem and the file has already been unlinked.
1464  *
1465  * The caller is responsible for correctly setting the sync parameter.
1466  * It gets too hard for us to guess here which path we're being called
1467  * out of just based on inode state.
1468  */
1469 int
1470 xfs_itruncate_finish(
1471         xfs_trans_t     **tp,
1472         xfs_inode_t     *ip,
1473         xfs_fsize_t     new_size,
1474         int             fork,
1475         int             sync)
1476 {
1477         xfs_fsblock_t   first_block;
1478         xfs_fileoff_t   first_unmap_block;
1479         xfs_fileoff_t   last_block;
1480         xfs_filblks_t   unmap_len=0;
1481         xfs_mount_t     *mp;
1482         xfs_trans_t     *ntp;
1483         int             done;
1484         int             committed;
1485         xfs_bmap_free_t free_list;
1486         int             error;
1487
1488         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1489         ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
1490         ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
1491         ASSERT(*tp != NULL);
1492         ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1493         ASSERT(ip->i_transp == *tp);
1494         ASSERT(ip->i_itemp != NULL);
1495         ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
1496
1497
1498         ntp = *tp;
1499         mp = (ntp)->t_mountp;
1500         ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
1501
1502         /*
1503          * We only support truncating the entire attribute fork.
1504          */
1505         if (fork == XFS_ATTR_FORK) {
1506                 new_size = 0LL;
1507         }
1508         first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1509         xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
1510         /*
1511          * The first thing we do is set the size to new_size permanently
1512          * on disk.  This way we don't have to worry about anyone ever
1513          * being able to look at the data being freed even in the face
1514          * of a crash.  What we're getting around here is the case where
1515          * we free a block, it is allocated to another file, it is written
1516          * to, and then we crash.  If the new data gets written to the
1517          * file but the log buffers containing the free and reallocation
1518          * don't, then we'd end up with garbage in the blocks being freed.
1519          * As long as we make the new_size permanent before actually
1520          * freeing any blocks it doesn't matter if they get written to.
1521          *
1522          * The callers must signal into us whether or not the size
1523          * setting here must be synchronous.  There are a few cases
1524          * where it doesn't have to be synchronous.  Those cases
1525          * occur if the file is unlinked and we know the unlink is
1526          * permanent or if the blocks being truncated are guaranteed
1527          * to be beyond the inode eof (regardless of the link count)
1528          * and the eof value is permanent.  Both of these cases occur
1529          * only on wsync-mounted filesystems.  In those cases, we're
1530          * guaranteed that no user will ever see the data in the blocks
1531          * that are being truncated so the truncate can run async.
1532          * In the free beyond eof case, the file may wind up with
1533          * more blocks allocated to it than it needs if we crash
1534          * and that won't get fixed until the next time the file
1535          * is re-opened and closed but that's ok as that shouldn't
1536          * be too many blocks.
1537          *
1538          * However, we can't just make all wsync xactions run async
1539          * because there's one call out of the create path that needs
1540          * to run sync where it's truncating an existing file to size
1541          * 0 whose size is > 0.
1542          *
1543          * It's probably possible to come up with a test in this
1544          * routine that would correctly distinguish all the above
1545          * cases from the values of the function parameters and the
1546          * inode state but for sanity's sake, I've decided to let the
1547          * layers above just tell us.  It's simpler to correctly figure
1548          * out in the layer above exactly under what conditions we
1549          * can run async and I think it's easier for others to read and
1550          * follow the logic in case something has to be changed.
1551          * cscope is your friend -- rcc.
1552          *
1553          * The attribute fork is much simpler.
1554          *
1555          * For the attribute fork we allow the caller to tell us whether
1556          * the unlink of the inode that led to this call is yet permanent
1557          * in the on disk log.  If it is not and we will be freeing extents
1558          * in this inode then we make the first transaction synchronous
1559          * to make sure that the unlink is permanent by the time we free
1560          * the blocks.
1561          */
1562         if (fork == XFS_DATA_FORK) {
1563                 if (ip->i_d.di_nextents > 0) {
1564                         ip->i_d.di_size = new_size;
1565                         xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1566                 }
1567         } else if (sync) {
1568                 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
1569                 if (ip->i_d.di_anextents > 0)
1570                         xfs_trans_set_sync(ntp);
1571         }
1572         ASSERT(fork == XFS_DATA_FORK ||
1573                 (fork == XFS_ATTR_FORK &&
1574                         ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
1575                          (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
1576
1577         /*
1578          * Since it is possible for space to become allocated beyond
1579          * the end of the file (in a crash where the space is allocated
1580          * but the inode size is not yet updated), simply remove any
1581          * blocks which show up between the new EOF and the maximum
1582          * possible file size.  If the first block to be removed is
1583          * beyond the maximum file size (ie it is the same as last_block),
1584          * then there is nothing to do.
1585          */
1586         last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1587         ASSERT(first_unmap_block <= last_block);
1588         done = 0;
1589         if (last_block == first_unmap_block) {
1590                 done = 1;
1591         } else {
1592                 unmap_len = last_block - first_unmap_block + 1;
1593         }
1594         while (!done) {
1595                 /*
1596                  * Free up to XFS_ITRUNC_MAX_EXTENTS.  xfs_bunmapi()
1597                  * will tell us whether it freed the entire range or
1598                  * not.  If this is a synchronous mount (wsync),
1599                  * then we can tell bunmapi to keep all the
1600                  * transactions asynchronous since the unlink
1601                  * transaction that made this inode inactive has
1602                  * already hit the disk.  There's no danger of
1603                  * the freed blocks being reused, there being a
1604                  * crash, and the reused blocks suddenly reappearing
1605                  * in this file with garbage in them once recovery
1606                  * runs.
1607                  */
1608                 XFS_BMAP_INIT(&free_list, &first_block);
1609                 error = xfs_bunmapi(ntp, ip, first_unmap_block,
1610                                     unmap_len,
1611                                     XFS_BMAPI_AFLAG(fork) |
1612                                       (sync ? 0 : XFS_BMAPI_ASYNC),
1613                                     XFS_ITRUNC_MAX_EXTENTS,
1614                                     &first_block, &free_list, &done);
1615                 if (error) {
1616                         /*
1617                          * If the bunmapi call encounters an error,
1618                          * return to the caller where the transaction
1619                          * can be properly aborted.  We just need to
1620                          * make sure we're not holding any resources
1621                          * that we were not when we came in.
1622                          */
1623                         xfs_bmap_cancel(&free_list);
1624                         return error;
1625                 }
1626
1627                 /*
1628                  * Duplicate the transaction that has the permanent
1629                  * reservation and commit the old transaction.
1630                  */
1631                 error = xfs_bmap_finish(tp, &free_list, first_block,
1632                                         &committed);
1633                 ntp = *tp;
1634                 if (error) {
1635                         /*
1636                          * If the bmap finish call encounters an error,
1637                          * return to the caller where the transaction
1638                          * can be properly aborted.  We just need to
1639                          * make sure we're not holding any resources
1640                          * that we were not when we came in.
1641                          *
1642                          * Aborting from this point might lose some
1643                          * blocks in the file system, but oh well.
1644                          */
1645                         xfs_bmap_cancel(&free_list);
1646                         if (committed) {
1647                                 /*
1648                                  * If the passed in transaction committed
1649                                  * in xfs_bmap_finish(), then we want to
1650                                  * add the inode to this one before returning.
1651                                  * This keeps things simple for the higher
1652                                  * level code, because it always knows that
1653                                  * the inode is locked and held in the
1654                                  * transaction that returns to it whether
1655                                  * errors occur or not.  We don't mark the
1656                                  * inode dirty so that this transaction can
1657                                  * be easily aborted if possible.
1658                                  */
1659                                 xfs_trans_ijoin(ntp, ip,
1660                                         XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1661                                 xfs_trans_ihold(ntp, ip);
1662                         }
1663                         return error;
1664                 }
1665
1666                 if (committed) {
1667                         /*
1668                          * The first xact was committed,
1669                          * so add the inode to the new one.
1670                          * Mark it dirty so it will be logged
1671                          * and moved forward in the log as
1672                          * part of every commit.
1673                          */
1674                         xfs_trans_ijoin(ntp, ip,
1675                                         XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1676                         xfs_trans_ihold(ntp, ip);
1677                         xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1678                 }
1679                 ntp = xfs_trans_dup(ntp);
1680                 (void) xfs_trans_commit(*tp, 0, NULL); /* NOTE(review): commit status is discarded */
1681                 *tp = ntp;
1682                 error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1683                                           XFS_TRANS_PERM_LOG_RES,
1684                                           XFS_ITRUNCATE_LOG_COUNT);
1685                 /*
1686                  * Add the inode being truncated to the next chained
1687                  * transaction.  This is done before the reserve error is
1688                  * checked, so the inode is joined even on the error return.
1689                  */
1690                 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1691                 xfs_trans_ihold(ntp, ip);
1692                 if (error)
1693                         return (error);
1694         }
1695         /*
1696          * Only update the size in the case of the data fork, but
1697          * always re-log the inode so that our permanent transaction
1698          * can keep on rolling it forward in the log.
1699          */
1700         if (fork == XFS_DATA_FORK) {
1701                 xfs_isize_check(mp, ip, new_size);
1702                 ip->i_d.di_size = new_size;
1703         }
1704         xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1705         ASSERT((new_size != 0) ||
1706                (fork == XFS_ATTR_FORK) ||
1707                (ip->i_delayed_blks == 0));
1708         ASSERT((new_size != 0) ||
1709                (fork == XFS_ATTR_FORK) ||
1710                (ip->i_d.di_nextents == 0));
1711         xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
1712         return 0;
1713 }
1713
1714
1715 /*
1716  * xfs_igrow_start
1717  *
1718  * First half of growing a file.  Hands the current on-disk size
1719  * (ip->i_d.di_size) and new_size to xfs_zero_eof() so that pages
1720  * created by xfs_write_file() beyond the end of the file, and any
1721  * blocks between the old and new file sizes, are zeroed.  This has
1722  * to happen before the inode is joined to the transaction that
1723  * modifies the size; that way we can drop the inode lock and call
1724  * into the buffer cache to get the buffer mapping the EOF.
1725  *
1726  * ip       - inode being grown; its ilock and iolock must both be
1727  *            held for update.
1728  * new_size - target size; must be larger than ip->i_d.di_size.
1729  * credp    - not referenced by this routine.
1730  *
1731  * Returns whatever xfs_zero_eof() returns.
1732  */
1733 int
1734 xfs_igrow_start(
1735         xfs_inode_t     *ip,
1736         xfs_fsize_t     new_size,
1737         cred_t          *credp)
1738 {
1739         xfs_fsize_t     old_size = ip->i_d.di_size;
1740
1741         ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1742         ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1743         ASSERT(new_size > ip->i_d.di_size);
1744
1745         return xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size,
1746                             old_size, new_size);
1747 }
1748
1749 /*
1750  * xfs_igrow_finish
1751  *
1752  * Second half of growing a file: record new_size as the inode's
1753  * on-disk size and log the inode core in transaction tp.  The
1754  * caller must hold both the ilock and the iolock for update, the
1755  * inode must already belong to tp, and xfs_igrow_start() must have
1756  * been called beforehand.  A nonzero change_flag additionally
1757  * refreshes the inode change timestamp.
1758  */
1759 void
1760 xfs_igrow_finish(
1761         xfs_trans_t     *tp,
1762         xfs_inode_t     *ip,
1763         xfs_fsize_t     new_size,
1764         int             change_flag)
1765 {
1766         ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1767         ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1768         ASSERT(ip->i_transp == tp);
1769         ASSERT(new_size > ip->i_d.di_size);
1770
1771         /* Grow the size first so it is part of what gets logged below. */
1772         ip->i_d.di_size = new_size;
1773
1774         /* The change timestamp is only touched on request. */
1775         if (change_flag != 0) {
1776                 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1777         }
1778
1779         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1780 }
1781
1782
1783 /*
1784  * Called when the inode's link count goes to 0.  Inserts the on-disk
1785  * inode at the head of its AGI unlinked bucket (it is pulled off when the
1786  * inode is freed).  Returns 0, EFSCORRUPTED, or a buffer-read error.
1787  */
1788 int
1789 xfs_iunlink(
1790         xfs_trans_t     *tp,
1791         xfs_inode_t     *ip)
1792 {
1793         xfs_mount_t     *mp;
1794         xfs_agi_t       *agi;
1795         xfs_dinode_t    *dip;
1796         xfs_buf_t       *agibp;
1797         xfs_buf_t       *ibp;
1798         xfs_agnumber_t  agno;
1799         xfs_daddr_t     agdaddr;
1800         xfs_agino_t     agino;
1801         short           bucket_index;
1802         int             offset;
1803         int             error;
1804         int             agi_ok;
1805
1806         ASSERT(ip->i_d.di_nlink == 0);
1807         ASSERT(ip->i_d.di_mode != 0);
1808         ASSERT(ip->i_transp == tp);
1809
1810         mp = tp->t_mountp;
1811
1812         agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1813         agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1814
1815         /*
1816          * Get the agi buffer first.  It ensures lock ordering
1817          * on the list.
1818          */
1819         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
1820                                    XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1821         if (error) {
1822                 return error;
1823         }
1824         /*
1825          * Validate the magic number and version of the agi block.
1826          */
1827         agi = XFS_BUF_TO_AGI(agibp);
1828         agi_ok =
1829                 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
1830                 XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
1831         if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1832                         XFS_RANDOM_IUNLINK))) {
1833                 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1834                 xfs_trans_brelse(tp, agibp);
1835                 return XFS_ERROR(EFSCORRUPTED);
1836         }
1837         /*
1838          * Get the index into the agi hash table for the
1839          * list this inode will go on.
1840          */
1841         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1842         ASSERT(agino != 0);
1843         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1844         ASSERT(!INT_ISZERO(agi->agi_unlinked[bucket_index], ARCH_CONVERT));
1845         ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != agino);
1846
1847         if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO) {
1848                 /*
1849                  * There is already another inode in the bucket we need
1850                  * to add ourselves to.  Add us at the front of the list.
1851                  * Here we put the head pointer into our next pointer,
1852                  * and then we fall through to point the head at us.
1853                  */
1854                 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1855                 if (error) {
1856                         return error; /* NOTE(review): agibp is not released on this path */
1857                 }
1858                 ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO);
1859                 ASSERT(!INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT));
1860                 /* both on-disk, don't endian flip twice */
1861                 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1862                 offset = ip->i_boffset +
1863                         offsetof(xfs_dinode_t, di_next_unlinked);
1864                 xfs_trans_inode_buf(tp, ibp);
1865                 xfs_trans_log_buf(tp, ibp, offset,
1866                                   (offset + sizeof(xfs_agino_t) - 1));
1867                 xfs_inobp_check(mp, ibp);
1868         }
1869
1870         /*
1871          * Point the bucket head pointer at the inode being inserted.
1872          */
1873         ASSERT(agino != 0);
1874         INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, agino);
1875         offset = offsetof(xfs_agi_t, agi_unlinked) +
1876                 (sizeof(xfs_agino_t) * bucket_index);
1877         xfs_trans_log_buf(tp, agibp, offset,
1878                           (offset + sizeof(xfs_agino_t) - 1));
1879         return 0;
1880 }
1881
1882 /*
1883  * Pull the on-disk inode off its AGI unlinked bucket list, whether it is
1884  * the bucket head or deeper in the chain.  Returns 0 or an error code. */
1885 STATIC int
1886 xfs_iunlink_remove(
1887         xfs_trans_t     *tp,
1888         xfs_inode_t     *ip)
1889 {
1890         xfs_ino_t       next_ino;
1891         xfs_mount_t     *mp;
1892         xfs_agi_t       *agi;
1893         xfs_dinode_t    *dip;
1894         xfs_buf_t       *agibp;
1895         xfs_buf_t       *ibp;
1896         xfs_agnumber_t  agno;
1897         xfs_daddr_t     agdaddr;
1898         xfs_agino_t     agino;
1899         xfs_agino_t     next_agino;
1900         xfs_buf_t       *last_ibp;
1901         xfs_dinode_t    *last_dip;
1902         short           bucket_index;
1903         int             offset, last_offset;
1904         int             error;
1905         int             agi_ok;
1906
1907         /*
1908          * First pull the on-disk inode from the AGI unlinked list.
1909          */
1910         mp = tp->t_mountp;
1911
1912         agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1913         agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1914
1915         /*
1916          * Get the agi buffer first.  It ensures lock ordering
1917          * on the list.
1918          */
1919         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
1920                                    XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1921         if (error) {
1922                 cmn_err(CE_WARN,
1923                         "xfs_iunlink_remove: xfs_trans_read_buf()  returned an error %d on %s.  Returning error.",
1924                         error, mp->m_fsname);
1925                 return error;
1926         }
1927         /*
1928          * Validate the magic number and version of the agi block.
1929          */
1930         agi = XFS_BUF_TO_AGI(agibp);
1931         agi_ok =
1932                 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
1933                 XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
1934         if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
1935                         XFS_RANDOM_IUNLINK_REMOVE))) {
1936                 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
1937                                      mp, agi);
1938                 xfs_trans_brelse(tp, agibp);
1939                 cmn_err(CE_WARN,
1940                         "xfs_iunlink_remove: XFS_TEST_ERROR()  returned an error on %s.  Returning EFSCORRUPTED.",
1941                          mp->m_fsname);
1942                 return XFS_ERROR(EFSCORRUPTED);
1943         }
1944         /*
1945          * Get the index into the agi hash table for the
1946          * list this inode is on.
1947          */
1948         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1949         ASSERT(agino != 0);
1950         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1951         ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO);
1952         ASSERT(!INT_ISZERO(agi->agi_unlinked[bucket_index], ARCH_CONVERT));
1953
1954         if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) == agino) {
1955                 /*
1956                  * We're at the head of the list.  Get the inode's
1957                  * on-disk buffer to see if there is anyone after us
1958                  * on the list.  Only modify our next pointer if it
1959                  * is not already NULLAGINO.  This saves us the overhead
1960                  * of dealing with the buffer when there is no need to
1961                  * change it.
1962                  */
1963                 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1964                 if (error) {
1965                         cmn_err(CE_WARN,
1966                                 "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
1967                                 error, mp->m_fsname);
1968                         return error;
1969                 }
1970                 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
1971                 ASSERT(next_agino != 0);
1972                 if (next_agino != NULLAGINO) {
1973                         INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
1974                         offset = ip->i_boffset +
1975                                 offsetof(xfs_dinode_t, di_next_unlinked);
1976                         xfs_trans_inode_buf(tp, ibp);
1977                         xfs_trans_log_buf(tp, ibp, offset,
1978                                           (offset + sizeof(xfs_agino_t) - 1));
1979                         xfs_inobp_check(mp, ibp);
1980                 } else {
1981                         xfs_trans_brelse(tp, ibp);
1982                 }
1983                 /*
1984                  * Point the bucket head pointer at the next inode.
1985                  */
1986                 ASSERT(next_agino != 0);
1987                 ASSERT(next_agino != agino);
1988                 INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, next_agino);
1989                 offset = offsetof(xfs_agi_t, agi_unlinked) +
1990                         (sizeof(xfs_agino_t) * bucket_index);
1991                 xfs_trans_log_buf(tp, agibp, offset,
1992                                   (offset + sizeof(xfs_agino_t) - 1));
1993         } else {
1994                 /*
1995                  * We need to search the list for the inode being freed.
1996                  */
1997                 next_agino = INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT);
1998                 last_ibp = NULL;
1999                 while (next_agino != agino) {
2000                         /*
2001                          * If the last inode wasn't the one pointing to
2002                          * us, then release its buffer since we're not
2003                          * going to do anything with it.
2004                          */
2005                         if (last_ibp != NULL) {
2006                                 xfs_trans_brelse(tp, last_ibp);
2007                         }
2008                         next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2009                         error = xfs_inotobp(mp, tp, next_ino, &last_dip,
2010                                             &last_ibp, &last_offset);
2011                         if (error) {
2012                                 cmn_err(CE_WARN,
2013                         "xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.  Returning error.",
2014                                         error, mp->m_fsname);
2015                                 return error;
2016                         }
2017                         next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT);
2018                         ASSERT(next_agino != NULLAGINO);
2019                         ASSERT(next_agino != 0);
2020                 }
2021                 /*
2022                  * Now last_ibp points to the buffer previous to us on
2023                  * the unlinked list.  Pull us from the list.
2024                  */
2025                 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
2026                 if (error) {
2027                         cmn_err(CE_WARN,
2028                                 "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
2029                                 error, mp->m_fsname);
2030                         return error;
2031                 }
2032                 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
2033                 ASSERT(next_agino != 0);
2034                 ASSERT(next_agino != agino);
2035                 if (next_agino != NULLAGINO) {
2036                         INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
2037                         offset = ip->i_boffset +
2038                                 offsetof(xfs_dinode_t, di_next_unlinked);
2039                         xfs_trans_inode_buf(tp, ibp);
2040                         xfs_trans_log_buf(tp, ibp, offset,
2041                                           (offset + sizeof(xfs_agino_t) - 1));
2042                         xfs_inobp_check(mp, ibp);
2043                 } else {
2044                         xfs_trans_brelse(tp, ibp);
2045                 }
2046                 /*
2047                  * Point the previous inode on the list to the next inode.
2048                  */
2049                 INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino);
2050                 ASSERT(next_agino != 0);
2051                 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2052                 xfs_trans_inode_buf(tp, last_ibp);
2053                 xfs_trans_log_buf(tp, last_ibp, offset,
2054                                   (offset + sizeof(xfs_agino_t) - 1));
2055                 xfs_inobp_check(mp, last_ibp);
2056         }
2057         return 0;
2058 }
2059
2060 static __inline__ int xfs_inode_clean(xfs_inode_t *ip)
2061 {       /* 1 when no XFS_ILOG_ALL field is flagged and i_update_core is 0 */
2062         if (ip->i_itemp && (ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL))
2063                 return 0;       /* log item still has fields flagged */
2064         return (ip->i_update_core == 0);
2065 }
2066
2067 STATIC void
2068 xfs_ifree_cluster(
2069         xfs_inode_t     *free_ip,
2070         xfs_trans_t     *tp,
2071         xfs_ino_t       inum)
2072 {
2073         xfs_mount_t             *mp = free_ip->i_mount;
2074         int                     blks_per_cluster;
2075         int                     nbufs;
2076         int                     ninodes;
2077         int                     i, j, found, pre_flushed;
2078         xfs_daddr_t             blkno;
2079         xfs_buf_t               *bp;
2080         xfs_ihash_t             *ih;
2081         xfs_inode_t             *ip, **ip_found;
2082         xfs_inode_log_item_t    *iip;
2083         xfs_log_item_t          *lip;
2084         SPLDECL(s);
2085
2086         if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
2087                 blks_per_cluster = 1;
2088                 ninodes = mp->m_sb.sb_inopblock;
2089                 nbufs = XFS_IALLOC_BLOCKS(mp);
2090         } else {
2091                 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2092                                         mp->m_sb.sb_blocksize;
2093                 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2094                 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2095         }
2096
2097         ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
2098
2099         for (j = 0; j < nbufs; j++, inum += ninodes) {
2100                 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2101                                          XFS_INO_TO_AGBNO(mp, inum));
2102
2103
2104                 /*
2105                  * Look for each inode in memory and attempt to lock it,
2106                  * we can be racing with flush and tail pushing here.
2107                  * any inode we get the locks on, add to an array of
2108                  * inode items to process later.
2109                  *
2110                  * The get the buffer lock, we could beat a flush
2111                  * or tail pushing thread to the lock here, in which
2112                  * case they will go looking for the inode buffer
2113                  * and fail, we need some other form of interlock
2114                  * here.
2115                  */
2116                 found = 0;
2117                 for (i = 0; i < ninodes; i++) {
2118                         ih = XFS_IHASH(mp, inum + i);
2119                         read_lock(&ih->ih_lock);
2120                         for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
2121                                 if (ip->i_ino == inum + i)
2122                                         break;
2123                         }
2124
2125                         /* Inode not in memory or we found it already,
2126                          * nothing to do
2127                          */
2128                         if (!ip || (ip->i_flags & XFS_ISTALE)) {
2129                                 read_unlock(&ih->ih_lock);
2130                                 continue;
2131                         }
2132
2133                         if (xfs_inode_clean(ip)) {
2134                                 read_unlock(&ih->ih_lock);
2135                                 continue;
2136                         }
2137
2138                         /* If we can get the locks then add it to the
2139                          * list, otherwise by the time we get the bp lock
2140                          * below it will already be attached to the
2141                          * inode buffer.
2142                          */
2143
2144                         /* This inode will already be locked - by us, lets
2145                          * keep it that way.
2146                          */
2147
2148                         if (ip == free_ip) {
2149                                 if (xfs_iflock_nowait(ip)) {
2150                                         ip->i_flags |= XFS_ISTALE;
2151
2152                                         if (xfs_inode_clean(ip)) {
2153                                                 xfs_ifunlock(ip);
2154                                         } else {
2155                                                 ip_found[found++] = ip;
2156                                         }
2157                                 }
2158                                 read_unlock(&ih->ih_lock);
2159                                 continue;
2160                         }
2161
2162                         if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2163                                 if (xfs_iflock_nowait(ip)) {
2164                                         ip->i_flags |= XFS_ISTALE;
2165
2166                                         if (xfs_inode_clean(ip)) {
2167                                                 xfs_ifunlock(ip);
2168                                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2169                                         } else {
2170                                                 ip_found[found++] = ip;
2171                                         }
2172                                 } else {
2173                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
2174                                 }
2175                         }
2176
2177                         read_unlock(&ih->ih_lock);
2178                 }
2179
2180                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2181                                         mp->m_bsize * blks_per_cluster,
2182                                         XFS_BUF_LOCK);
2183
2184                 pre_flushed = 0;
2185                 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
2186                 while (lip) {
2187                         if (lip->li_type == XFS_LI_INODE) {
2188                                 iip = (xfs_inode_log_item_t *)lip;
2189                                 ASSERT(iip->ili_logged == 1);
2190                                 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2191                                 AIL_LOCK(mp,s);
2192                                 iip->ili_flush_lsn = iip->ili_item.li_lsn;
2193                                 AIL_UNLOCK(mp, s);
2194                                 iip->ili_inode->i_flags |= XFS_ISTALE;
2195                                 pre_flushed++;
2196                         }
2197                         lip = lip->li_bio_list;
2198                 }
2199
2200                 for (i = 0; i < found; i++) {
2201                         ip = ip_found[i];
2202                         iip = ip->i_itemp;
2203
2204                         if (!iip) {
2205                                 ip->i_update_core = 0;
2206                                 xfs_ifunlock(ip);
2207                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2208                                 continue;
2209                         }
2210
2211                         iip->ili_last_fields = iip->ili_format.ilf_fields;
2212                         iip->ili_format.ilf_fields = 0;
2213                         iip->ili_logged = 1;
2214                         AIL_LOCK(mp,s);
2215                         iip->ili_flush_lsn = iip->ili_item.li_lsn;
2216                         AIL_UNLOCK(mp, s);
2217
2218                         xfs_buf_attach_iodone(bp,
2219                                 (void(*)(xfs_buf_t*,xfs_log_item_t*))
2220                                 xfs_istale_done, (xfs_log_item_t *)iip);
2221                         if (ip != free_ip) {
2222                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2223                         }
2224                 }
2225
2226                 if (found || pre_flushed)
2227                         xfs_trans_stale_inode_buf(tp, bp);
2228                 xfs_trans_binval(tp, bp);
2229         }
2230
2231         kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
2232 }
2233
2234 /*
2235  * Return an inode to the inode free list.  The inode should already
2236  * be truncated to 0 length, have no pages associated with it, and be
2237  * a part of the transaction tp.
2238  *
2239  * The on-disk copy of the inode will have been added to the list of
2240  * unlinked inodes in the AGI; it is pulled off that list here, in the
2241  * same transaction that frees it.  Returns 0, or the error reported by
2242  * xfs_iunlink_remove() or xfs_difree(); flist is handed to xfs_difree().
2243  */
2244 int
2245 xfs_ifree(
2246         xfs_trans_t     *tp,
2247         xfs_inode_t     *ip,
2248         xfs_bmap_free_t *flist)
2249 {
2250         int                     error;
2251         int                     delete;         /* set by xfs_difree() */
2252         xfs_ino_t               first_ino;      /* set by xfs_difree() */
2253
2254         ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
2255         ASSERT(ip->i_transp == tp);
2256         ASSERT(ip->i_d.di_nlink == 0);
2257         ASSERT(ip->i_d.di_nextents == 0);
2258         ASSERT(ip->i_d.di_anextents == 0);
2259         ASSERT((ip->i_d.di_size == 0) ||
2260                ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
2261         ASSERT(ip->i_d.di_nblocks == 0);
2262
2263         /*
2264          * Pull the on-disk inode from the AGI unlinked list.
2265          */
2266         error = xfs_iunlink_remove(tp, ip);
2267         if (error != 0) {
2268                 return error;
2269         }
2270
2271         error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2272         if (error != 0) {
2273                 return error;
2274         }
2275         ip->i_d.di_mode = 0;            /* mark incore inode as free */
2276         ip->i_d.di_flags = 0;
2277         ip->i_d.di_dmevmask = 0;
2278         ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
2279         ip->i_df.if_ext_max =
2280                 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
2281         ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2282         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2283         /*
2284          * Bump the generation count so no one will be confused
2285          * by reincarnations of this inode.
2286          */
2287         ip->i_d.di_gen++;
2288         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2289         /* When xfs_difree() sets delete, hand first_ino to xfs_ifree_cluster(). */
2290         if (delete) {
2291                 xfs_ifree_cluster(ip, tp, first_ino);
2292         }
2293
2294         return 0;
2295 }
2296
2297 /*
2298  * Reallocate the space for if_broot based on the number of records
2299  * being added or deleted as indicated in rec_diff.  Move the records
2300  * and pointers in if_broot to fit the new size.  When shrinking, this
2301  * eliminates holes between the records and pointers created by the
2302  * caller.  When growing, this creates holes to be filled in by the
2303  * caller.
2304  *
2305  * The caller must not request to add more records than would fit in
2306  * the on-disk inode root.  If if_broot_bytes is currently 0 and records
2307  * are being added, a new root is allocated.  The caller must also not
2308  * request that the number of records go below zero, although it can
2309  * go to zero, in which case the root is freed.
2310  *
2311  * ip -- the inode whose if_broot area is changing
2312  * rec_diff -- the change in the number of records, positive or negative
2313  * whichfork -- selects the fork of ip (via XFS_IFORK_PTR) to operate on
2314  */
2315 void
2316 xfs_iroot_realloc(
2317         xfs_inode_t             *ip,
2318         int                     rec_diff,
2319         int                     whichfork)
2320 {
2321         int                     cur_max;
2322         xfs_ifork_t             *ifp;
2323         xfs_bmbt_block_t        *new_broot;
2324         int                     new_max;
2325         size_t                  new_size;
2326         char                    *np;
2327         char                    *op;
2328
2329         /*
2330          * Handle the degenerate case quietly.
2331          */
2332         if (rec_diff == 0) {
2333                 return;
2334         }
2335
2336         ifp = XFS_IFORK_PTR(ip, whichfork);
2337         if (rec_diff > 0) {
2338                 /*
2339                  * If there wasn't any memory allocated before, just
2340                  * allocate it now and get out.
2341                  */
2342                 if (ifp->if_broot_bytes == 0) {
2343                         new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2344                         ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
2345                                                                      KM_SLEEP);
2346                         ifp->if_broot_bytes = (int)new_size;
2347                         return;
2348                 }
2349
2350                 /*
2351                  * If there is already an existing if_broot, then we need
2352                  * to realloc() it and memmove() the cur_max pointers to their
2353                  * new location.  The records don't change location because
2354                  * they are kept butted up against the btree block header.
2355                  */
2356                 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2357                 new_max = cur_max + rec_diff;
2358                 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2359                 ifp->if_broot = (xfs_bmbt_block_t *)
2360                   kmem_realloc(ifp->if_broot,
2361                                 new_size,
2362                                 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2363                                 KM_SLEEP);
2364                 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2365                                                       ifp->if_broot_bytes); /* still the old size */
2366                 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2367                                                       (int)new_size);
2368                 ifp->if_broot_bytes = (int)new_size;
2369                 ASSERT(ifp->if_broot_bytes <=
2370                         XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2371                 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2372                 return;
2373         }
2374
2375         /*
2376          * rec_diff is less than 0.  In this case, we are shrinking the
2377          * if_broot buffer.  It must already exist.  If we go to zero
2378          * records, just get rid of the root and clear XFS_IFBROOT.
2379          */
2380         ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2381         cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2382         new_max = cur_max + rec_diff;
2383         ASSERT(new_max >= 0);
2384         if (new_max > 0)
2385                 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2386         else
2387                 new_size = 0;
2388         if (new_size > 0) {
2389                 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
2390                 /*
2391                  * First copy over the btree block header.
2392                  */
2393                 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
2394         } else {
2395                 new_broot = NULL;
2396                 ifp->if_flags &= ~XFS_IFBROOT;
2397         }
2398
2399         /*
2400          * Only copy the records and pointers if there are any.
2401          */
2402         if (new_max > 0) {
2403                 /*
2404                  * First copy the records.
2405                  */
2406                 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
2407                                                      ifp->if_broot_bytes);
2408                 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2409                                                      (int)new_size);
2410                 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2411
2412                 /*
2413                  * Then copy the pointers.
2414                  */
2415                 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2416                                                      ifp->if_broot_bytes);
2417                 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
2418                                                      (int)new_size);
2419                 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2420         }
2421         kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2422         ifp->if_broot = new_broot;
2423         ifp->if_broot_bytes = (int)new_size;
2424         ASSERT(ifp->if_broot_bytes <=
2425                 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2426         return;
2427 }
2428
2429
2430 /*
2431  * This is called when the amount of space needed for if_extents
2432  * is increased or decreased.  The change in size is indicated by
2433  * the number of extents that need to be added or deleted in the
2434  * ext_diff parameter.
2435  *
2436  * If the new size is zero, any allocated buffer is freed and if_extents
2437  * is set to NULL.  If it fits in if_inline_ext, the inline buffer is
2438  * used.  Otherwise kmem_alloc() or kmem_realloc() provides a buffer of
2439  * the new size, run through xfs_iroundup() unless already a power of two.
2440  *
2441  * ip -- the inode whose if_extents area is changing
2442  * ext_diff -- the change in the number of extents, positive or negative
2443  * whichfork -- selects the fork of ip (via XFS_IFORK_PTR) to operate on
2444  */
2445 void
2446 xfs_iext_realloc(
2447         xfs_inode_t     *ip,
2448         int             ext_diff,
2449         int             whichfork)
2450 {
2451         int             byte_diff;
2452         xfs_ifork_t     *ifp;
2453         int             new_size;
2454         uint            rnew_size;
2455
2456         if (ext_diff == 0) {
2457                 return;
2458         }
2459
2460         ifp = XFS_IFORK_PTR(ip, whichfork);
2461         byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t);
2462         new_size = (int)ifp->if_bytes + byte_diff;
2463         ASSERT(new_size >= 0);
2464
2465         if (new_size == 0) {
2466                 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
2467                         ASSERT(ifp->if_real_bytes != 0);
2468                         kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2469                 }
2470                 ifp->if_u1.if_extents = NULL;
2471                 rnew_size = 0;
2472         } else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) {
2473                 /*
2474                  * If the valid extents can fit in if_inline_ext,
2475                  * copy them from the malloc'd vector and free it.
2476                  */
2477                 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
2478                         /*
2479                          * For now, empty files are format EXTENTS,
2480                          * so the if_extents pointer is null.
2481                          */
2482                         if (ifp->if_u1.if_extents) {
2483                                 memcpy(ifp->if_u2.if_inline_ext,
2484                                         ifp->if_u1.if_extents, new_size);
2485                                 kmem_free(ifp->if_u1.if_extents,
2486                                           ifp->if_real_bytes);
2487                         }
2488                         ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
2489                 }
2490                 rnew_size = 0;
2491         } else {
2492                 rnew_size = new_size;
2493                 if ((rnew_size & (rnew_size - 1)) != 0)
2494                         rnew_size = xfs_iroundup(rnew_size);
2495                 /*
2496                  * Stuck with malloc/realloc.
2497                  */
2498                 if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) {
2499                         ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
2500                                 kmem_alloc(rnew_size, KM_SLEEP);
2501                         memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
2502                               sizeof(ifp->if_u2.if_inline_ext));
2503                 } else if (rnew_size != ifp->if_real_bytes) {
2504                         ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
2505                           kmem_realloc(ifp->if_u1.if_extents,
2506                                         rnew_size,
2507                                         ifp->if_real_bytes,
2508                                         KM_NOFS);
2509                 }
2510         }
2511         ifp->if_real_bytes = rnew_size; /* 0 unless a kmem buffer is in use */
2512         ifp->if_bytes = new_size;
2513 }
2514
2515
2516 /*
2517  * This is called when the amount of space needed for if_data
2518  * is increased or decreased.  The change in size is indicated by
2519  * the number of bytes that need to be added or deleted in the
2520  * byte_diff parameter.
2521  *
2522  * If the new size is zero, any allocated buffer is freed and if_data
2523  * is set to NULL.  If it fits in if_inline_data, the inline buffer is
2524  * used.  Otherwise kmem_alloc() or kmem_realloc() provides a buffer of
2525  * the new size rounded up to a multiple of 4 bytes.
2526  *
2527  * ip -- the inode whose if_data area is changing
2528  * byte_diff -- the change in the number of bytes, positive or negative
2529  * whichfork -- selects the fork of ip (via XFS_IFORK_PTR) to operate on
2530  */
2531 void
2532 xfs_idata_realloc(
2533         xfs_inode_t     *ip,
2534         int             byte_diff,
2535         int             whichfork)
2536 {
2537         xfs_ifork_t     *ifp;
2538         int             new_size;
2539         int             real_size;
2540
2541         if (byte_diff == 0) {
2542                 return;
2543         }
2544
2545         ifp = XFS_IFORK_PTR(ip, whichfork);
2546         new_size = (int)ifp->if_bytes + byte_diff;
2547         ASSERT(new_size >= 0);
2548
2549         if (new_size == 0) {
2550                 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2551                         kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2552                 }
2553                 ifp->if_u1.if_data = NULL;
2554                 real_size = 0;
2555         } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2556                 /*
2557                  * If the valid data can fit in if_inline_data,
2558                  * copy it from the malloc'd buffer and free that buffer.
2559                  */
2560                 if (ifp->if_u1.if_data == NULL) {
2561                         ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2562                 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2563                         ASSERT(ifp->if_real_bytes != 0);
2564                         memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2565                               new_size);
2566                         kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2567                         ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2568                 }
2569                 real_size = 0;
2570         } else {
2571                 /*
2572                  * Stuck with malloc/realloc.
2573                  * For inline data, the underlying buffer must be
2574                  * a multiple of 4 bytes in size so that it can be
2575                  * logged and stay on word boundaries.  We enforce
2576                  * that here.
2577                  */
2578                 real_size = roundup(new_size, 4);
2579                 if (ifp->if_u1.if_data == NULL) {
2580                         ASSERT(ifp->if_real_bytes == 0);
2581                         ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2582                 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2583                         /*
2584                          * Only do the realloc if the underlying size
2585                          * is really changing.
2586                          */
2587                         if (ifp->if_real_bytes != real_size) {
2588                                 ifp->if_u1.if_data =
2589                                         kmem_realloc(ifp->if_u1.if_data,
2590                                                         real_size,
2591                                                         ifp->if_real_bytes,
2592                                                         KM_SLEEP);
2593                         }
2594                 } else {
2595                         ASSERT(ifp->if_real_bytes == 0);
2596                         ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2597                         memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2598                                 ifp->if_bytes);
2599                 }
2600         }
2601         ifp->if_real_bytes = real_size; /* 0 unless a kmem buffer is in use */
2602         ifp->if_bytes = new_size;
2603         ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2604 }
2605
2606
2607
2608
2609 /*
2610  * Map inode to disk block and offset.  Returns 0 on success, or the
2611  * error from xfs_dilocate(), in which case imap is left unmodified.
2612  * mp -- the mount point structure for the current file system
2613  * tp -- the current transaction
2614  * ino -- the inode number of the inode to be located
2615  * imap -- filled in with the information necessary to retrieve the inode
2616  *       from disk; im_blkno is read on entry to seed the fsbno lookup
2617  * flags -- flags to pass to xfs_dilocate indicating whether or not
2618  *       lookups in the inode btree were OK or not
2619  */
2620 int
2621 xfs_imap(
2622         xfs_mount_t     *mp,
2623         xfs_trans_t     *tp,
2624         xfs_ino_t       ino,
2625         xfs_imap_t      *imap,
2626         uint            flags)
2627 {
2628         xfs_fsblock_t   fsbno;
2629         int             len;
2630         int             off;
2631         int             error;
2632         /* A zero im_blkno on entry is passed to xfs_dilocate() as NULLFSBLOCK. */
2633         fsbno = imap->im_blkno ?
2634                 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2635         error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2636         if (error != 0) {
2637                 return error;
2638         }
2639         imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2640         imap->im_len = XFS_FSB_TO_BB(mp, len);
2641         imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2642         imap->im_ioffset = (ushort)off;
2643         imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2644         return 0;
2645 }
2646 /* Free one fork's if_broot and kmem-allocated data/extents buffer; for XFS_ATTR_FORK also free i_afp. */
2647 void
2648 xfs_idestroy_fork(
2649         xfs_inode_t     *ip,
2650         int             whichfork)
2651 {
2652         xfs_ifork_t     *ifp;
2653
2654         ifp = XFS_IFORK_PTR(ip, whichfork);
2655         if (ifp->if_broot != NULL) {
2656                 kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2657                 ifp->if_broot = NULL;
2658         }
2659
2660         /*
2661          * If the format is local, then we can't have an extents
2662          * array so just look for an inline data array.  If we're
2663          * not local then we may or may not have an extents list,
2664          * so check and free it up if we do.
2665          */
2666         if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2667                 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2668                     (ifp->if_u1.if_data != NULL)) {
2669                         ASSERT(ifp->if_real_bytes != 0);
2670                         kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2671                         ifp->if_u1.if_data = NULL;
2672                         ifp->if_real_bytes = 0;
2673                 }
2674         } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2675                    (ifp->if_u1.if_extents != NULL) &&
2676                    (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
2677                 ASSERT(ifp->if_real_bytes != 0);
2678                 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2679                 ifp->if_u1.if_extents = NULL;
2680                 ifp->if_real_bytes = 0;
2681         }
2682         ASSERT(ifp->if_u1.if_extents == NULL ||
2683                ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2684         ASSERT(ifp->if_real_bytes == 0);
2685         if (whichfork == XFS_ATTR_FORK) {       /* the attr fork struct itself came from xfs_ifork_zone */
2686                 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2687                 ip->i_afp = NULL;
2688         }
2689 }
2690
2691 /*
2692  * This is called to free all the memory associated with an inode.
2693  * It must free the inode itself and any buffers allocated for
2694  * if_extents/if_data and if_broot.  It must also free the locks
2695  * (i_lock, i_iolock, i_flock) associated with the inode.
2696  */
2697 void
2698 xfs_idestroy(
2699         xfs_inode_t     *ip)
2700 {
2701         /* Only regular files, directories and symlinks have their data fork destroyed here. */
2702         switch (ip->i_d.di_mode & S_IFMT) {
2703         case S_IFREG:
2704         case S_IFDIR:
2705         case S_IFLNK:
2706                 xfs_idestroy_fork(ip, XFS_DATA_FORK);
2707                 break;
2708         }
2709         if (ip->i_afp)
2710                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2711         mrfree(&ip->i_lock);
2712         mrfree(&ip->i_iolock);
2713         freesema(&ip->i_flock);
2714 #ifdef XFS_BMAP_TRACE
2715         ktrace_free(ip->i_xtrace);
2716 #endif
2717 #ifdef XFS_BMBT_TRACE
2718         ktrace_free(ip->i_btrace);
2719 #endif
2720 #ifdef XFS_RW_TRACE
2721         ktrace_free(ip->i_rwtrace);
2722 #endif
2723 #ifdef XFS_ILOCK_TRACE
2724         ktrace_free(ip->i_lock_trace);
2725 #endif
2726 #ifdef XFS_DIR2_TRACE
2727         ktrace_free(ip->i_dir_trace);
2728 #endif
2729         if (ip->i_itemp) {
2730                 /* XXXdpd should be able to assert this unconditionally, but
2731                  * shutdown is leaving the AIL behind. */
2732                 ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) ||
2733                        XFS_FORCED_SHUTDOWN(ip->i_mount));
2734                 xfs_inode_item_destroy(ip);
2735         }
2736         kmem_zone_free(xfs_inode_zone, ip);
2737 }
2738
2739
2740 /*
2741  * Increment the pin count of the given inode.  The count is updated with
2742  * atomic_inc(); the ASSERT expects i_lock to be held in MR_UPDATE mode.
2743  */
2744 void
2745 xfs_ipin(
2746         xfs_inode_t     *ip)
2747 {
2748         ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
2749
2750         atomic_inc(&ip->i_pincount);
2751 }
2752
2753 /*
2754  * Decrement the pin count of the given inode, and wake up
2755  * anyone in xfs_iunpin_wait() if the count goes to 0.  The
2756  * inode must have been previously pinned with a call to xfs_ipin().
2757  */
2758 void
2759 xfs_iunpin(
2760         xfs_inode_t     *ip)
2761 {
2762         ASSERT(atomic_read(&ip->i_pincount) > 0);
2763
2764         if (atomic_dec_and_test(&ip->i_pincount)) {
2765 #if XXXKAN
2766                 /*
2767                  * Should I mark FreeBSD vnode as dirty here?
2768                  */
2769                 printf("%s:%d: Should I mark FreeBSD vnode as dirty here?\n",
2770                     __FILE__, __LINE__);
2771                 xfs_vnode_t     *vp = XFS_ITOV_NULL(ip);
2772
2773                 /* make sync come back and flush this inode */
2774                 if (vp) {
2775                         struct inode    *inode = LINVFS_GET_IP(vp);
2776
2777                         if (!(inode->i_state & I_NEW))
2778                                 mark_inode_dirty_sync(inode);
2779                 }
2780 #endif
2781                 /* Waiters sleep on &ip->i_ipin_wait in xfs_iunpin_wait(). */
2782                 wakeup(&ip->i_ipin_wait);
2783         }
2784 }
2785
2786 /*
2787  * This is called to wait for the given inode to be unpinned.
2788  * It will sleep until this happens.  The caller must have the
2789  * inode locked in at least shared mode so that the inode cannot
2790  * be subsequently pinned once someone is waiting for it to be
2791  * unpinned.
2792  */
2793 STATIC void
2794 xfs_iunpin_wait(
2795         xfs_inode_t     *ip)
2796 {
2797         xfs_inode_log_item_t    *iip;
2798         xfs_lsn_t       lsn;
2799
2800         ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
2801
2802         if (atomic_read(&ip->i_pincount) == 0) {
2803                 return;
2804         }
2805         /* Force up to the inode item's ili_last_lsn when it is set, else pass lsn 0. */
2806         iip = ip->i_itemp;
2807         if (iip && iip->ili_last_lsn) {
2808                 lsn = iip->ili_last_lsn;
2809         } else {
2810                 lsn = (xfs_lsn_t)0;
2811         }
2812
2813         /*
2814          * Give the log a push so we don't wait here too long.
2815          */
2816         xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
2817
2818         /*
2819          * XXXKAN: xfs_iunpin is not locking inode at all?  NOTE(review): no lock
2820          * covers the check and tsleep(), and timo is 0 -- confirm a wakeup() cannot be missed.
2821          */
2822         while(atomic_read(&ip->i_pincount) != 0)
2823                 tsleep(&ip->i_ipin_wait, PRIBIO, "iunpin", 0);
2824 }
2825
2826
2827 /*
2828  * xfs_iextents_copy()
2829  *
2830  * This is called to copy the REAL extents (as opposed to the delayed
2831  * allocation extents) from the inode into the given buffer.  It
2832  * returns the number of bytes copied into the buffer.
2833  *
2834  * Each extent is examined in turn: those with a null start block
2835  * (delayed allocation) are skipped, and the rest are stored into the
2836  * buffer with put_unaligned() in on-disk format.
2837  */
2838 int
2839 xfs_iextents_copy(
2840         xfs_inode_t             *ip,
2841         xfs_bmbt_rec_t          *buffer,
2842         int                     whichfork)
2843 {
2844         int                     copied;
2845         xfs_bmbt_rec_t          *dest_ep;
2846         xfs_bmbt_rec_t          *ep;
2847 #ifdef XFS_BMAP_TRACE
2848         static char             fname[] = "xfs_iextents_copy";
2849 #endif
2850         int                     i;
2851         xfs_ifork_t             *ifp;
2852         int                     nrecs;
2853         xfs_fsblock_t           start_block;
2854
2855         ifp = XFS_IFORK_PTR(ip, whichfork);
2856         ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
2857         ASSERT(ifp->if_bytes > 0);
2858
2859         nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2860         xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork);
2861         ASSERT(nrecs > 0);
2862
2863         /*
2864          * There may be delayed allocation extents in the
2865          * inode, so copy the extents one at a time and skip
2866          * the delayed ones.  There must be at least one
2867          * non-delayed extent.
2868          */
2869         ep = ifp->if_u1.if_extents;
2870         dest_ep = buffer;
2871         copied = 0;
2872         for (i = 0; i < nrecs; i++) {
2873                 start_block = xfs_bmbt_get_startblock(ep);
2874                 if (ISNULLSTARTBLOCK(start_block)) {
2875                         /*
2876                          * It's a delayed allocation extent, so skip it.
2877                          */
2878                         ep++;
2879                         continue;
2880                 }
2881
2882                 /* Translate to on disk format */
2883                 put_unaligned(INT_GET(ep->l0, ARCH_CONVERT),
2884                               (__uint64_t*)&dest_ep->l0);
2885                 put_unaligned(INT_GET(ep->l1, ARCH_CONVERT),
2886                               (__uint64_t*)&dest_ep->l1);
2887                 dest_ep++;
2888                 ep++;
2889                 copied++;
2890         }
2891         ASSERT(copied != 0);
2892         xfs_validate_extents(buffer, copied, 1, XFS_EXTFMT_INODE(ip));
2893
2894         return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2895 }
2896
2897 /*
2898  * Copy one fork (whichfork) of the in-core inode ip into the on-disk
2899  * inode dip, as directed by the log flags in iip.  Each case below
2900  * stores into the same region of the on-disk inode, so only one can be
2901  * valid at a time.  Format and log flags may conflict (e.g. XFS_ILOG_?DATA
2902  * set while the fork is in EXTENTS format) only when the fork changed
2903  * formats after being modified but before being flushed; the format then
2904  * wins, because it indicates the current state of the fork.  Returns 0, or
2905  * XFS_ERROR(EFSCORRUPTED) if the local data fork fails on-disk validation.
2906  */
2907 /*ARGSUSED*/
2908 STATIC int
2909 xfs_iflush_fork(
2910         xfs_inode_t             *ip,
2911         xfs_dinode_t            *dip,
2912         xfs_inode_log_item_t    *iip,
2913         int                     whichfork,
2914         xfs_buf_t               *bp)
2915 {
2916         char                    *cp;    /* start of this fork inside dip */
2917         xfs_ifork_t             *ifp;
2918         xfs_mount_t             *mp;
2919 #ifdef XFS_TRANS_DEBUG
2920         int                     first;  /* not referenced in this function */
2921 #endif
2922         static const short      brootflag[2] =  /* per-fork log flags, indexed by whichfork */
2923                 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2924         static const short      dataflag[2] =
2925                 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2926         static const short      extflag[2] =
2927                 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2928         /* Without a log item there are no ilf_fields to consult: nothing to copy. */
2929         if (iip == NULL)
2930                 return 0;
2931         ifp = XFS_IFORK_PTR(ip, whichfork);
2932         /*
2933          * This can happen if we gave up in iformat in an error path,
2934          * for the attribute fork.
2935          */
2936         if (ifp == NULL) {
2937                 ASSERT(whichfork == XFS_ATTR_FORK);
2938                 return 0;
2939         }
2940         cp = XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT);
2941         mp = ip->i_mount;
2942         switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2943         case XFS_DINODE_FMT_LOCAL:      /* fork contents are held in if_u1.if_data */
2944                 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
2945                     (ifp->if_bytes > 0)) {
2946                         ASSERT(ifp->if_u1.if_data != NULL);
2947                         ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2948                         memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2949                 }
2950                 if (whichfork == XFS_DATA_FORK) {       /* checked even if nothing was copied */
2951                         if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
2952                                 XFS_ERROR_REPORT("xfs_iflush_fork",
2953                                                  XFS_ERRLEVEL_LOW, mp);
2954                                 return XFS_ERROR(EFSCORRUPTED);
2955                         }
2956                 }
2957                 break;
2958
2959         case XFS_DINODE_FMT_EXTENTS:    /* copy the real extent records */
2960                 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2961                        !(iip->ili_format.ilf_fields & extflag[whichfork]));
2962                 ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0));
2963                 ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0));
2964                 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2965                     (ifp->if_bytes > 0)) {
2966                         ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2967                         (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2968                                 whichfork);     /* byte count is not needed here */
2969                 }
2970                 break;
2971
2972         case XFS_DINODE_FMT_BTREE:      /* convert the in-core root (if_broot) into cp */
2973                 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
2974                     (ifp->if_broot_bytes > 0)) {
2975                         ASSERT(ifp->if_broot != NULL);
2976                         ASSERT(ifp->if_broot_bytes <=
2977                                (XFS_IFORK_SIZE(ip, whichfork) +
2978                                 XFS_BROOT_SIZE_ADJ));
2979                         xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
2980                                 (xfs_bmdr_block_t *)cp,
2981                                 XFS_DFORK_SIZE_ARCH(dip, mp, whichfork, ARCH_CONVERT));
2982                 }
2983                 break;
2984
2985         case XFS_DINODE_FMT_DEV:        /* data fork only: store if_rdev in di_u.di_dev */
2986                 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
2987                         ASSERT(whichfork == XFS_DATA_FORK);
2988                         INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev);
2989                 }
2990                 break;
2991
2992         case XFS_DINODE_FMT_UUID:       /* data fork only: store if_uuid in di_u.di_muuid */
2993                 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
2994                         ASSERT(whichfork == XFS_DATA_FORK);
2995                         memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
2996                                 sizeof(uuid_t));
2997                 }
2998                 break;
2999
3000         default:        /* unknown fork format: assert, but still return 0 */
3001                 ASSERT(0);
3002                 break;
3003         }
3004
3005         return 0;
3006 }
3007
3008 /*
3009  * xfs_iflush() will write a modified inode's changes out to the inode's
3010  * on-disk home, gathering into the same buffer write any other dirty,
3011  * unpinned inodes on ip's i_cnext list whose locks are available.  The
3012  * caller must hold the inode lock in at least shared mode and the inode
3013  * flush lock; the inode lock is still held on return.  The flush lock is
3014  * released when the inode reaches the disk, or at once when the inode is
3015  * clean or an early error occurs.  flags (XFS_IFLUSH_*) selects the write.
3016  */
3017 int
3018 xfs_iflush(
3019         xfs_inode_t             *ip,
3020         uint                    flags)
3021 {
3022         xfs_inode_log_item_t    *iip;
3023         xfs_buf_t               *bp;
3024         xfs_dinode_t            *dip;
3025         xfs_mount_t             *mp;
3026         int                     error;
3027         /* REFERENCED */
3028         xfs_chash_t             *ch;
3029         xfs_inode_t             *iq;            /* cursor over ip's i_cnext list */
3030         int                     clcount;        /* count of inodes clustered */
3031         int                     bufwasdelwri;   /* set and used only at cluster_corrupt_out */
3032         enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };  /* internal write-mode bits kept in flags */
3033         SPLDECL(s);
3034
3035         XFS_STATS_INC(xs_iflush_count);
3036
3037         ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3038         ASSERT(valusema(&ip->i_flock) <= 0);
3039         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3040                ip->i_d.di_nextents > ip->i_df.if_ext_max);
3041
3042         iip = ip->i_itemp;
3043         mp = ip->i_mount;
3044
3045         /*
3046          * If the inode isn't dirty, then just release the inode
3047          * flush lock and do nothing.
3048          */
3049         if ((ip->i_update_core == 0) &&
3050             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3051                 ASSERT((iip != NULL) ?
3052                          !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
3053                 xfs_ifunlock(ip);
3054                 return 0;
3055         }
3056
3057         /*
3058          * We can't flush the inode until it is unpinned, so
3059          * wait for it.  We know no one new can pin it, because
3060          * we are holding the inode lock shared and you need
3061          * to hold it exclusively to pin the inode.
3062          */
3063         xfs_iunpin_wait(ip);
3064
3065         /*
3066          * This may have been unpinned because the filesystem is shutting
3067          * down forcibly. If that's the case we must not write this inode
3068          * to disk, because the log record didn't make it to disk!
3069          */
3070         if (XFS_FORCED_SHUTDOWN(mp)) {
3071                 ip->i_update_core = 0;
3072                 if (iip)
3073                         iip->ili_format.ilf_fields = 0;
3074                 xfs_ifunlock(ip);
3075                 return XFS_ERROR(EIO);
3076         }
3077
3078         /*
3079          * Get the buffer containing the on-disk inode.
3080          */
3081         error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
3082         if (error != 0) {
3083                 xfs_ifunlock(ip);
3084                 return error;
3085         }
3086
3087         /*
3088          * Decide how buffer will be flushed out, replacing flags with INT_* bits.
3089          * This precedes xfs_iflush_int() because that call zeroes ilf_fields.
3090          */
3091         if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3092                 /*
3093                  * Flush out the inode buffer according to the directions
3094                  * of the caller.  In the cases where the caller has given
3095                  * us a choice choose the non-delwri case.  This is because
3096                  * the inode is in the AIL and we need to get it out soon.
3097                  */
3098                 switch (flags) {
3099                 case XFS_IFLUSH_SYNC:
3100                 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3101                         flags = 0;      /* no INT_* bit: synchronous xfs_bwrite() below */
3102                         break;
3103                 case XFS_IFLUSH_ASYNC:
3104                 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3105                         flags = INT_ASYNC;
3106                         break;
3107                 case XFS_IFLUSH_DELWRI:
3108                         flags = INT_DELWRI;
3109                         break;
3110                 default:
3111                         ASSERT(0);
3112                         flags = 0;
3113                         break;
3114                 }
3115         } else {        /* no logged fields pending: honor the caller's delwri preference */
3116                 switch (flags) {
3117                 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3118                 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3119                 case XFS_IFLUSH_DELWRI:
3120                         flags = INT_DELWRI;
3121                         break;
3122                 case XFS_IFLUSH_ASYNC:
3123                         flags = INT_ASYNC;
3124                         break;
3125                 case XFS_IFLUSH_SYNC:
3126                         flags = 0;
3127                         break;
3128                 default:
3129                         ASSERT(0);
3130                         flags = 0;
3131                         break;
3132                 }
3133         }
3134
3135         /*
3136          * First flush out the inode that xfs_iflush was called with.
3137          */
3138         error = xfs_iflush_int(ip, bp);
3139         if (error) {
3140                 goto corrupt_out;
3141         }
3142
3143         /*
3144          * inode clustering:
3145          * see if other inodes can be gathered into this write
3146          */
3147
3148         ip->i_chash->chl_buf = bp;
3149
3150         ch = XFS_CHASH(mp, ip->i_blkno);
3151         s = mutex_spinlock(&ch->ch_lock);       /* held across the whole walk below */
3152
3153         clcount = 0;
3154         for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {    /* stops on returning to ip */
3155                 /*
3156                  * Do an un-protected check to see if the inode is dirty and
3157                  * is a candidate for flushing.  These checks will be repeated
3158                  * later after the appropriate locks are acquired.
3159                  */
3160                 iip = iq->i_itemp;      /* iip now refers to iq's log item, not ip's */
3161                 if ((iq->i_update_core == 0) &&
3162                     ((iip == NULL) ||
3163                      !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
3164                       xfs_ipincount(iq) == 0) {
3165                         continue;
3166                 }
3167
3168                 /*
3169                  * Try to get locks.  If any are unavailable,
3170                  * then this inode cannot be flushed and is skipped.
3171                  */
3172
3173                 /* get inode locks (just i_lock) */
3174                 if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
3175                         /* get inode flush lock */
3176                         if (xfs_iflock_nowait(iq)) {
3177                                 /* check if pinned */
3178                                 if (xfs_ipincount(iq) == 0) {
3179                                         /* arriving here means that
3180                                          * this inode can be flushed.
3181                                          * first re-check that it's
3182                                          * dirty
3183                                          */
3184                                         iip = iq->i_itemp;
3185                                         if ((iq->i_update_core != 0)||
3186                                             ((iip != NULL) &&
3187                                              (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3188                                                 clcount++;
3189                                                 error = xfs_iflush_int(iq, bp); /* on success iq's flush lock is not dropped here */
3190                                                 if (error) {
3191                                                         xfs_iunlock(iq,
3192                                                                     XFS_ILOCK_SHARED);
3193                                                         goto cluster_corrupt_out;
3194                                                 }
3195                                         } else {        /* turned out clean under the locks */
3196                                                 xfs_ifunlock(iq);
3197                                         }
3198                                 } else {        /* pinned: skip it */
3199                                         xfs_ifunlock(iq);
3200                                 }
3201                         }
3202                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
3203                 }
3204         }
3205         mutex_spinunlock(&ch->ch_lock, s);
3206
3207         if (clcount) {
3208                 XFS_STATS_INC(xs_icluster_flushcnt);
3209                 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
3210         }
3211
3212         /*
3213          * If the buffer is pinned then push on the log so we won't
3214          * get stuck waiting in the write for too long.
3215          */
3216         if (XFS_BUF_ISPINNED(bp)){
3217                 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3218         }
3219
3220         if (flags & INT_DELWRI) {
3221                 xfs_bdwrite(mp, bp);
3222         } else if (flags & INT_ASYNC) {
3223                 xfs_bawrite(mp, bp);
3224         } else {
3225                 error = xfs_bwrite(mp, bp);
3226         }
3227         return error;   /* still 0 here unless assigned by xfs_bwrite() */
3228
3229 corrupt_out:    /* xfs_iflush_int() failed on ip itself */
3230         xfs_buf_relse(bp);
3231         xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
3232         xfs_iflush_abort(ip);
3233         /*
3234          * Unlocks the flush lock
3235          */
3236         return XFS_ERROR(EFSCORRUPTED);
3237
3238 cluster_corrupt_out:
3239         /* Corruption detected in the clustering loop.  Invalidate the
3240          * inode buffer and shut down the filesystem.
3241          */
3242         mutex_spinunlock(&ch->ch_lock, s);
3243
3244         /*
3245          * Clean up the buffer.  If it was B_DELWRI, just release it --
3246          * brelse can handle it with no problems.  If not, shut down the
3247          * filesystem before releasing the buffer.
3248          */
3249         if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
3250                 xfs_buf_relse(bp);
3251         }
3252
3253         xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
3254
3255         if(!bufwasdelwri)  {
3256                 /*
3257                  * Just like incore_relse: if we have b_iodone functions,
3258                  * mark the buffer as an error and call them.  Otherwise
3259                  * mark it as stale and brelse.
3260                  */
3261                 if (XFS_BUF_IODONE_FUNC(bp)) {
3262                         XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3263                         XFS_BUF_UNDONE(bp);
3264                         XFS_BUF_STALE(bp);
3265                         XFS_BUF_SHUT(bp);
3266                         XFS_BUF_ERROR(bp,EIO);
3267                         xfs_biodone(bp);
3268                 } else {
3269                         XFS_BUF_STALE(bp);
3270                         xfs_buf_relse(bp);
3271                 }
3272         }
3273         /* NOTE(review): iq's ILOCK_SHARED was dropped before the goto - confirm the abort is safe without it. */
3274         xfs_iflush_abort(iq);
3275         /*
3276          * Unlocks the flush lock
3277          */
3278         return XFS_ERROR(EFSCORRUPTED);
3279 }
3280 /* xfs_iflush_int(): copy the in-core inode ip into its slot (i_boffset) of buffer bp.  Returns 0,
3281  * or XFS_ERROR(EFSCORRUPTED) without releasing the flush lock when a check or the data fork fails. */
3282 STATIC int
3283 xfs_iflush_int(
3284         xfs_inode_t             *ip,
3285         xfs_buf_t               *bp)
3286 {
3287         xfs_inode_log_item_t    *iip;
3288         xfs_dinode_t            *dip;
3289         xfs_mount_t             *mp;
3290 #ifdef XFS_TRANS_DEBUG
3291         int                     first;  /* not referenced in this function */
3292 #endif
3293         SPLDECL(s);
3294
3295         ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3296         ASSERT(valusema(&ip->i_flock) <= 0);
3297         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3298                ip->i_d.di_nextents > ip->i_df.if_ext_max);
3299
3300         iip = ip->i_itemp;
3301         mp = ip->i_mount;
3302
3303
3304         /*
3305          * If the inode isn't dirty, then just release the inode
3306          * flush lock and do nothing.
3307          */
3308         if ((ip->i_update_core == 0) &&
3309             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3310                 xfs_ifunlock(ip);
3311                 return 0;
3312         }
3313
3314         /* set *dip = inode's place in the buffer */
3315         dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
3316
3317         /*
3318          * Clear i_update_core before copying out the data.
3319          * This is for coordination with our timestamp updates
3320          * that don't hold the inode lock. They will always
3321          * update the timestamps BEFORE setting i_update_core,
3322          * so if we clear i_update_core after they set it we
3323          * are guaranteed to see their updates to the timestamps.
3324          * I believe that this depends on strongly ordered memory
3325          * semantics, but we have that.  We use the SYNCHRONIZE
3326          * macro to make sure that the compiler does not reorder
3327          * the i_update_core access below the data copy below.
3328          */
3329         ip->i_update_core = 0;
3330         SYNCHRONIZE();
3331         /* Sanity-check the on-disk magic and then the in-core inode before copying anything out. */
3332         if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC,
3333                                mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3334                 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3335                     "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3336                         ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip);
3337                 goto corrupt_out;
3338         }
3339         if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
3340                                 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
3341                 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3342                         "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
3343                         ip->i_ino, ip, ip->i_d.di_magic);
3344                 goto corrupt_out;
3345         }
3346         if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {    /* regular file: EXTENTS or BTREE only */
3347                 if (XFS_TEST_ERROR(
3348                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3349                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3350                     mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3351                         xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3352                                 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
3353                                 ip->i_ino, ip);
3354                         goto corrupt_out;
3355                 }
3356         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {     /* directory: LOCAL is also allowed */
3357                 if (XFS_TEST_ERROR(
3358                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3359                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3360                     (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3361                     mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3362                         xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3363                                 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
3364                                 ip->i_ino, ip);
3365                         goto corrupt_out;
3366                 }
3367         }
3368         if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3369                                 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3370                                 XFS_RANDOM_IFLUSH_5)) {
3371                 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3372                         "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
3373                         ip->i_ino,
3374                         ip->i_d.di_nextents + ip->i_d.di_anextents,
3375                         ip->i_d.di_nblocks,
3376                         ip);
3377                 goto corrupt_out;
3378         }
3379         if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3380                                 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3381                 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3382                         "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3383                         ip->i_ino, ip->i_d.di_forkoff, ip);
3384                 goto corrupt_out;
3385         }
3386         /*
3387          * bump the flush iteration count, used to detect flushes which
3388          * postdate a log record during recovery.
3389          */
3390
3391         ip->i_d.di_flushiter++;
3392
3393         /*
3394          * Copy the dirty parts of the inode into the on-disk
3395          * inode.  We always copy out the core of the inode,
3396          * because if the inode is dirty at all the core must
3397          * be.
3398          */
3399         xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d),
3400                 -1, ARCH_CONVERT);
3401
3402         /* Wrap the in-core count (after the copy above); we never let the log put out DI_MAX_FLUSH */
3403         if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3404                 ip->i_d.di_flushiter = 0;
3405
3406         /*
3407          * If this is really an old format inode and the superblock version
3408          * has not been updated to support only new format inodes, then
3409          * convert back to the old inode format.  If the superblock version
3410          * has been updated, then make the conversion permanent.
3411          */
3412         ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
3413                XFS_SB_VERSION_HASNLINK(&mp->m_sb));
3414         if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3415                 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
3416                         /*
3417                          * Convert it back.
3418                          */
3419                         ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3420                         INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink);
3421                 } else {
3422                         /*
3423                          * The superblock version has already been bumped,
3424                          * so just make the conversion to the new inode
3425                          * format permanent.
3426                          */
3427                         ip->i_d.di_version = XFS_DINODE_VERSION_2;
3428                         INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2);
3429                         ip->i_d.di_onlink = 0;
3430                         INT_ZERO(dip->di_core.di_onlink, ARCH_CONVERT);
3431                         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3432                         memset(&(dip->di_core.di_pad[0]), 0,
3433                               sizeof(dip->di_core.di_pad));
3434                         ASSERT(ip->i_d.di_projid == 0);
3435                 }
3436         }
3437         /* Flush the data fork, then the attribute fork when XFS_IFORK_Q(ip) is set. */
3438         if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
3439                 goto corrupt_out;
3440         }
3441
3442         if (XFS_IFORK_Q(ip)) {
3443                 /*
3444                  * The only error from xfs_iflush_fork is on the data fork.
3445                  */
3446                 (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3447         }
3448         xfs_inobp_check(mp, bp);
3449
3450         /*
3451          * We've recorded everything logged in the inode, so we'd
3452          * like to clear the ilf_fields bits so we don't log and
3453          * flush things unnecessarily.  However, we can't stop
3454          * logging all this information until the data we've copied
3455          * into the disk buffer is written to disk.  If we did we might
3456          * overwrite the copy of the inode in the log with all the
3457          * data after re-logging only part of it, and in the face of
3458          * a crash we wouldn't have all the data we need to recover.
3459          *
3460          * What we do is move the bits to the ili_last_fields field.
3461          * When logging the inode, these bits are moved back to the
3462          * ilf_fields field.  In the xfs_iflush_done() routine we
3463          * clear ili_last_fields, since we know that the information
3464          * those bits represent is permanently on disk.  As long as
3465          * the flush completes before the inode is logged again, then
3466          * both ilf_fields and ili_last_fields will be cleared.
3467          *
3468          * We can play with the ilf_fields bits here, because the inode
3469          * lock must be held exclusively in order to set bits there
3470          * and the flush lock protects the ili_last_fields bits.
3471          * Set ili_logged so the flush done
3472          * routine can tell whether or not to look in the AIL.
3473          * Also, store the current LSN of the inode so that we can tell
3474          * whether the item has moved in the AIL from xfs_iflush_done().
3475          * In order to read the lsn we need the AIL lock, because
3476          * it is a 64 bit value that cannot be read atomically.
3477          */
3478         if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3479                 iip->ili_last_fields = iip->ili_format.ilf_fields;
3480                 iip->ili_format.ilf_fields = 0;
3481                 iip->ili_logged = 1;
3482
3483                 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
3484                 AIL_LOCK(mp,s);
3485                 iip->ili_flush_lsn = iip->ili_item.li_lsn;
3486                 AIL_UNLOCK(mp, s);
3487
3488                 /*
3489                  * Attach the function xfs_iflush_done to the inode's
3490                  * buffer.  This will remove the inode from the AIL
3491                  * and unlock the inode's flush lock when the inode is
3492                  * completely written to disk.
3493                  */
3494                 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
3495                                       xfs_iflush_done, (xfs_log_item_t *)iip);
3496
3497                 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
3498                 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
3499         } else {
3500                 /*
3501                  * We're flushing an inode which is not in the AIL and has
3502                  * not been logged but has i_update_core set.  For this
3503                  * case we can use a B_DELWRI flush and immediately drop
3504                  * the inode flush lock because we can avoid the whole
3505                  * AIL state thing.  It's OK to drop the flush lock now,
3506                  * because we've already locked the buffer and to do anything
3507                  * you really need both.
3508                  */
3509                 if (iip != NULL) {
3510                         ASSERT(iip->ili_logged == 0);
3511                         ASSERT(iip->ili_last_fields == 0);
3512                         ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
3513                 }
3514                 xfs_ifunlock(ip);
3515         }
3516
3517         return 0;
3518         /* Every failed check lands here; this path does not call xfs_ifunlock(). */
3519 corrupt_out:
3520         return XFS_ERROR(EFSCORRUPTED);
3521 }
3522
3523 /*
3524  * Flush all inactive inodes in mp.  Returns false if an inode with user
3525  * references stopped the scan (only when XFS_FLUSH_ALL is clear), else true.
3526  */
3527 int
3528 xfs_iflush_all(
3529         xfs_mount_t     *mp,
3530         int             flag)
3531 {
3532         int             busy;           /* a referenced inode stopped the scan */
3533         int             done;           /* ends the outer restart loop */
3534         int             purged;         /* inner walk was left after a reclaim/purge */
3535         xfs_inode_t     *ip;
3536         vmap_t          vmap;
3537         xfs_vnode_t     *vp;
3538
3539         busy = done = 0;
3540         while (!done) {         /* each pass restarts the walk from mp->m_inodes */
3541                 purged = 0;
3542                 XFS_MOUNT_ILOCK(mp);
3543                 ip = mp->m_inodes;
3544                 if (ip == NULL) {
3545                         break;  /* empty list; the lock is dropped after the loop */
3546                 }
3547                 do {
3548                         /* Make sure we skip markers inserted by sync */
3549                         if (ip->i_mount == NULL) {
3550                                 ip = ip->i_mnext;
3551                                 continue;
3552                         }
3553
3554                         /*
3555                          * It's up to our caller to purge the root
3556                          * and quota vnodes later.
3557                          */
3558                         vp = XFS_ITOV_NULL(ip);
3559
3560                         if (!vp) {      /* no vnode attached: reclaim the inode directly */
3561                                 XFS_MOUNT_IUNLOCK(mp);
3562                                 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3563                                 purged = 1;
3564                                 break;
3565                         }
3566
3567                         if (vn_count(vp) != 0) {
3568                                 if (vn_count(vp) == 1 &&        /* root/quota inode with one reference: skip */
3569                                     (ip == mp->m_rootip ||
3570                                      (mp->m_quotainfo &&
3571                                       (ip->i_ino == mp->m_sb.sb_uquotino ||
3572                                        ip->i_ino == mp->m_sb.sb_gquotino)))) {
3573
3574                                         ip = ip->i_mnext;
3575                                         continue;
3576                                 }
3577                                 if (!(flag & XFS_FLUSH_ALL)) {  /* leaves both loops with the lock held */
3578                                         busy = 1;
3579                                         done = 1;
3580                                         break;
3581                                 }
3582                                 /*
3583                                  * Ignore busy inodes but continue flushing
3584                                  * others.
3585                                  */
3586                                 ip = ip->i_mnext;
3587                                 continue;
3588                         }
3589                         /*
3590                          * Sample vp mapping while holding mp locked on MP
3591                          * systems, so we don't purge a reclaimed or
3592                          * nonexistent vnode.  We break from the loop
3593                          * since we know that the list is modified when the
3594                          * inode is pulled from it in xfs_reclaim(),
3595                          * called via vn_purge() below.  ip is not advanced
3596                          * here; the outer loop restarts the walk and the
3597                          * purged flag records how this loop was left.
3598                          */
3599                         VMAP(vp, vmap);
3600                         XFS_MOUNT_IUNLOCK(mp);
3601
3602                         vn_purge(vp, &vmap);
3603
3604                         purged = 1;
3605                         break;
3606                 } while (ip != mp->m_inodes);
3607                 /*
3608                  * We need to distinguish between when we exit the loop
3609                  * after a purge and when we simply hit the end of the
3610                  * list.  We can't use the (ip == mp->m_inodes) test,
3611                  * because when we purge an inode at the start of the list
3612                  * the next inode on the list becomes mp->m_inodes.  That
3613                  * would cause such a test to bail out early.  The purged
3614                  * variable tells us how we got out of the loop.
3615                  */
3616                 if (!purged) {
3617                         done = 1;
3618                 }
3619         }
3620         XFS_MOUNT_IUNLOCK(mp);  /* every exit from the while loop still holds the lock */
3621         return !busy;
3622 }
3623
3624
3625 /*
3626  * xfs_iaccess: check accessibility of inode for mode.
3627  */
3628 int
3629 xfs_iaccess(
3630         xfs_inode_t     *ip,
3631         mode_t          mode,
3632         cred_t          *cr)
3633 {
3634         xfs_vnode_t     *vp;
3635         int             error;
3636         mode_t          imode;
3637
3638         vp = XFS_ITOV(ip);
3639         imode = (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_type);
3640
3641         if (mode & S_IWUSR) {
3642                 xfs_mount_t     *mp = ip->i_mount;
3643
3644                 if ((XVFSTOMNT(XFS_MTOVFS(mp))->mnt_flag & MNT_RDONLY) &&
3645                     (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode)))
3646                         return XFS_ERROR(EROFS);
3647
3648 #if XXXKAN
3649                 if (IS_IMMUTABLE(inode))
3650                         return XFS_ERROR(EACCES);
3651 #endif
3652         }
3653
3654         /*
3655          * If there's an Access Control List it's used instead of
3656          * the mode bits.
3657          */
3658         if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1)
3659                 return error ? XFS_ERROR(error) : 0;
3660
3661
3662         error = vaccess(vp->v_type, imode, ip->i_d.di_uid, ip->i_d.di_gid,
3663             mode, cr, NULL);
3664
3665         return (error);
3666 }
3667
/*
 * xfs_iroundup: round up argument to next power of two
 *
 * Zero and exact powers of two are returned unchanged.  Any other
 * value is expected to be below 0x80000000 (asserted), because the
 * next power of two above it would not fit in the 32 bits this
 * routine works with; if the assertion is compiled out such a value
 * yields 0.
 */
uint
xfs_iroundup(
	uint	v)
{
	uint	bit;

	/* Nothing to do for 0 or for a value with a single bit set. */
	if ((v & (v - 1)) == 0)
		return v;
	ASSERT((v & 0x80000000) == 0);

	/*
	 * Turn bits on from the bottom up until v has the form 2^k - 1,
	 * i.e. until v and v + 1 share no bits.  v + 1 is then the
	 * power of two we are after.
	 */
	for (bit = 1; (v & (v + 1)) != 0; bit <<= 1)
		v |= bit;

	return v + 1;
}
3693
3694 /*
3695  * Change the requested timestamp in the given inode.
3696  * We don't lock across timestamp updates, and we don't log them but
3697  * we do record the fact that there is dirty information in core.
3698  *
3699  * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
3700  *              with XFS_ICHGTIME_ACC to be sure that access time
3701  *              update will take.  Calling first with XFS_ICHGTIME_ACC
3702  *              and then XFS_ICHGTIME_MOD may fail to modify the access
3703  *              timestamp if the filesystem is mounted noacctm.
3704  */
3705 void
3706 xfs_ichgtime(xfs_inode_t *ip,
3707              int flags)
3708 {
3709         timespec_t      tv;
3710         xfs_vnode_t     *vp = XFS_ITOV(ip);
3711         /*
3712          * We're not supposed to change timestamps in readonly-mounted
3713          * filesystems.  Throw it away if anyone asks us.
3714          */
3715         if (unlikely(vp->v_vfsp->vfs_flag & VFS_RDONLY))
3716                 return;
3717
3718         /*
3719          * Don't update access timestamps on reads if mounted "noatime"
3720          * Throw it away if anyone asks us.
3721          */
3722         if ((ip->i_mount->m_flags & XFS_MOUNT_NOATIME ||
3723             (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)) &&
3724             ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG))
3725                         == XFS_ICHGTIME_ACC))
3726                 return;
3727
3728         nanotime(&tv);
3729         if (flags & XFS_ICHGTIME_MOD) {
3730                 VN_MTIMESET(vp, &tv);
3731                 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
3732         }
3733         if (flags & XFS_ICHGTIME_ACC) {
3734                 VN_ATIMESET(vp, &tv);
3735                 ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
3736         }
3737         if (flags & XFS_ICHGTIME_CHG) {
3738                 VN_CTIMESET(vp, &tv);
3739                 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
3740         }
3741
3742         /*
3743          * We update the i_update_core field _after_ changing
3744          * the timestamps in order to coordinate properly with
3745          * xfs_iflush() so that we don't lose timestamp updates.
3746          * This keeps us from having to hold the inode lock
3747          * while doing this.  We use the SYNCHRONIZE macro to
3748          * ensure that the compiler does not reorder the update
3749          * of i_update_core above the timestamp updates above.
3750          */
3751         SYNCHRONIZE();
3752         ip->i_update_core = 1;
3753 #if XXXKAN
3754         if (!(inode->i_state & I_LOCK))
3755                 mark_inode_dirty_sync(inode);
3756
3757         printf("xfs_ichgtime mark vnode dirty\n");
3758 #endif
3759 }
3760
#ifdef XFS_ILOCK_TRACE
/*
 * Trace buffer pointer for inode lock tracing.  Not referenced by
 * xfs_ilock_trace() itself, which logs into the per-inode
 * ip->i_lock_trace buffer instead.
 */
ktrace_t	*xfs_ilock_trace_buf;

/*
 * Record one inode lock event in the inode's own trace buffer.
 *
 *   ip		inode being locked or unlocked
 *   lock	operation code (1 = LOCK, 3 = UNLOCK, etc)
 *   lockflags	lock flags passed to the operation (XFS_ILOCK_EXCL etc)
 *   ra		caller of ilock
 *
 * The current cpu and pid are logged alongside; the remaining trace
 * slots are filled with zeroes.
 */
void
xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
{
	void	*op = (void *)(unsigned long)lock;
	void	*how = (void *)(unsigned long)lockflags;
	void	*cpu = (void *)(unsigned long)current_cpu();
	void	*pid = (void *)(unsigned long)current_pid();

	ktrace_enter(ip->i_lock_trace,
		     (void *)ip, op, how, (void *)ra, cpu, pid,
		     0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif