sys/fs/ext2fs/ext2_readwrite.c

   1 /*-
   2  *  modified for Lites 1.1
   3  *
   4  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
   5  *  University of Utah, Department of Computer Science
   6  */
   7 /*-
   8  * Copyright (c) 1993
   9  *      The Regents of the University of California.  All rights reserved.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  *      @(#)ufs_readwrite.c     8.7 (Berkeley) 1/21/94
  36  * $FreeBSD$
  37  */
  38
  39 /* XXX TODO: remove these obfuscations (as in ffs_vnops.c). */
  40 #define BLKSIZE(a, b, c)        blksize(a, b, c)
  41 #define FS                      struct m_ext2fs
  42 #define I_FS                    i_e2fs
  43 #define READ                    ext2_read
  44 #define READ_S                  "ext2_read"
  45 #define WRITE                   ext2_write
  46 #define WRITE_S                 "ext2_write"
  47
  48 #include <vm/vm.h>
  49 #include <vm/vm_extern.h>
  50 #include <vm/vm_object.h>
  51 #include <vm/vm_page.h>
  52 #include <vm/vm_pager.h>
  53 #include <vm/vnode_pager.h>
  54
  55 #include "opt_directio.h"
  56
  57 /*
  58  * Vnode op for reading.
  59  */
  60 static int
  61 READ(ap)
  62         struct vop_read_args /* {
  63                 struct vnode *a_vp;
  64                 struct uio *a_uio;
  65                 int a_ioflag;
  66                 struct ucred *a_cred;
  67         } */ *ap;
  68 {
  69         struct vnode *vp;
  70         struct inode *ip;
  71         struct uio *uio;
  72         FS *fs;
  73         struct buf *bp;
  74         daddr_t lbn, nextlbn;
  75         off_t bytesinfile;
  76         long size, xfersize, blkoffset;
  77         int error, orig_resid, seqcount;
  78         int ioflag;
  79
  80         vp = ap->a_vp;
  81         uio = ap->a_uio;
  82         ioflag = ap->a_ioflag;
  83
  84         seqcount = ap->a_ioflag >> IO_SEQSHIFT;
  85         ip = VTOI(vp);
  86
  87 #ifdef INVARIANTS
  88         if (uio->uio_rw != UIO_READ)
  89                 panic("%s: mode", READ_S);
  90
  91         if (vp->v_type == VLNK) {
  92                 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
  93                         panic("%s: short symlink", READ_S);
  94         } else if (vp->v_type != VREG && vp->v_type != VDIR)
  95                 panic("%s: type %d", READ_S, vp->v_type);
  96 #endif
  97         orig_resid = uio->uio_resid;
  98         KASSERT(orig_resid >= 0, ("ext2_read: uio->uio_resid < 0"));
  99         if (orig_resid == 0)
 100                 return (0);
 101         KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0"));
 102         fs = ip->I_FS;
 103         if (uio->uio_offset < ip->i_size &&
 104             uio->uio_offset >= fs->e2fs_maxfilesize)
 105                 return (EOVERFLOW);
 106
 107         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 108                 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 109                         break;
 110                 lbn = lblkno(fs, uio->uio_offset);
 111                 nextlbn = lbn + 1;
 112                 size = BLKSIZE(fs, ip, lbn);
 113                 blkoffset = blkoff(fs, uio->uio_offset);
 114
 115                 xfersize = fs->e2fs_fsize - blkoffset;
 116                 if (uio->uio_resid < xfersize)
 117                         xfersize = uio->uio_resid;
 118                 if (bytesinfile < xfersize)
 119                         xfersize = bytesinfile;
 120
 121                 if (lblktosize(fs, nextlbn) >= ip->i_size)
 122                         error = bread(vp, lbn, size, NOCRED, &bp);
 123                 else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0)
 124                         error = cluster_read(vp, ip->i_size, lbn, size,
 125                             NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
 126                 else if (seqcount > 1) {
 127                         int nextsize = BLKSIZE(fs, ip, nextlbn);
 128                         error = breadn(vp, lbn,
 129                             size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 130                 } else
 131                         error = bread(vp, lbn, size, NOCRED, &bp);
 132                 if (error) {
 133                         brelse(bp);
 134                         bp = NULL;
 135                         break;
 136                 }
 137
 138                 /*
 139                  * If IO_DIRECT then set B_DIRECT for the buffer.  This
 140                  * will cause us to attempt to release the buffer later on
 141                  * and will cause the buffer cache to attempt to free the
 142                  * underlying pages.
 143                  */
 144                 if (ioflag & IO_DIRECT)
 145                         bp->b_flags |= B_DIRECT;
 146
 147                 /*
 148                  * We should only get non-zero b_resid when an I/O error
 149                  * has occurred, which should cause us to break above.
 150                  * However, if the short read did not cause an error,
 151                  * then we want to ensure that we do not uiomove bad
 152                  * or uninitialized data.
 153                  */
 154                 size -= bp->b_resid;
 155                 if (size < xfersize) {
 156                         if (size == 0)
 157                                 break;
 158                         xfersize = size;
 159                 }
 160                 error = uiomove((char *)bp->b_data + blkoffset,
 161                         (int)xfersize, uio);
 162                 if (error)
 163                         break;
 164
 165                 if (ioflag & (IO_VMIO|IO_DIRECT)) {
 166                         /*
 167                          * If it's VMIO or direct I/O, then we don't
 168                          * need the buf, mark it available for
 169                          * freeing. If it's non-direct VMIO, the VM has
 170                          * the data.
 171                          */
 172                         bp->b_flags |= B_RELBUF;
 173                         brelse(bp);
 174                 } else {
 175                         /*
 176                          * Otherwise let whoever
 177                          * made the request take care of
 178                          * freeing it. We just queue
 179                          * it onto another list.
 180                          */
 181                         bqrelse(bp);
 182                 }
 183         }
 184
 185         /*
 186          * This can only happen in the case of an error
 187          * because the loop above resets bp to NULL on each iteration
 188          * and on normal completion has not set a new value into it.
 189          * so it must have come from a 'break' statement
 190          */
 191         if (bp != NULL) {
 192                 if (ioflag & (IO_VMIO|IO_DIRECT)) {
 193                         bp->b_flags |= B_RELBUF;
 194                         brelse(bp);
 195                 } else {
 196                         bqrelse(bp);
 197                 }
 198         }
 199
 200         if ((error == 0 || uio->uio_resid != orig_resid) &&
 201             (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
 202                 ip->i_flag |= IN_ACCESS;
 203         return (error);
 204 }
 205
 206 /*
 207  * Vnode op for writing.
 208  */
 209 static int
 210 WRITE(ap)
 211         struct vop_write_args /* {
 212                 struct vnode *a_vp;
 213                 struct uio *a_uio;
 214                 int a_ioflag;
 215                 struct ucred *a_cred;
 216         } */ *ap;
 217 {
 218         struct vnode *vp;
 219         struct uio *uio;
 220         struct inode *ip;
 221         FS *fs;
 222         struct buf *bp;
 223         daddr_t lbn;
 224         off_t osize;
 225         int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize;
 226
 227         ioflag = ap->a_ioflag;
 228         uio = ap->a_uio;
 229         vp = ap->a_vp;
 230
 231         seqcount = ioflag >> IO_SEQSHIFT;
 232         ip = VTOI(vp);
 233
 234 #ifdef INVARIANTS
 235         if (uio->uio_rw != UIO_WRITE)
 236                 panic("%s: mode", WRITE_S);
 237 #endif
 238
 239         switch (vp->v_type) {
 240         case VREG:
 241                 if (ioflag & IO_APPEND)
 242                         uio->uio_offset = ip->i_size;
 243                 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
 244                         return (EPERM);
 245                 /* FALLTHROUGH */
 246         case VLNK:
 247                 break;
 248         case VDIR:
 249                 /* XXX differs from ffs -- this is called from ext2_mkdir(). */
 250                 if ((ioflag & IO_SYNC) == 0)
 251                 panic("ext2_write: nonsync dir write");
 252                 break;
 253         default:
 254                 panic("ext2_write: type %p %d (%jd,%jd)", (void *)vp,
 255                     vp->v_type, (intmax_t)uio->uio_offset,
 256                     (intmax_t)uio->uio_resid);
 257         }
 258
 259         KASSERT(uio->uio_resid >= 0, ("ext2_write: uio->uio_resid < 0"));
 260         KASSERT(uio->uio_offset >= 0, ("ext2_write: uio->uio_offset < 0"));
 261         fs = ip->I_FS;
 262         if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->e2fs_maxfilesize)
 263                 return (EFBIG);
 264         /*
 265          * Maybe this should be above the vnode op call, but so long as
 266          * file servers have no limits, I don't think it matters.
 267          */
 268         if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 269                 return (EFBIG);
 270
 271         resid = uio->uio_resid;
 272         osize = ip->i_size;
 273         if (seqcount > BA_SEQMAX)
 274                 flags = BA_SEQMAX << BA_SEQSHIFT;
 275         else
 276                 flags = seqcount << BA_SEQSHIFT;
 277         if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
 278                 flags |= IO_SYNC;
 279
 280         for (error = 0; uio->uio_resid > 0;) {
 281                 lbn = lblkno(fs, uio->uio_offset);
 282                 blkoffset = blkoff(fs, uio->uio_offset);
 283                 xfersize = fs->e2fs_fsize - blkoffset;
 284                 if (uio->uio_resid < xfersize)
 285                         xfersize = uio->uio_resid;
 286                 if (uio->uio_offset + xfersize > ip->i_size)
 287                         vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 288
 289                 /*
 290                  * We must perform a read-before-write if the transfer size
 291                  * does not cover the entire buffer.
 292                  */
 293                 if (fs->e2fs_bsize > xfersize)
 294                         flags |= BA_CLRBUF;
 295                 else
 296                         flags &= ~BA_CLRBUF;
 297                 error = ext2_balloc(ip, lbn, blkoffset + xfersize,
 298                     ap->a_cred, &bp, flags);
 299                 if (error != 0)
 300                         break;
 301
 302                 /*
 303                  * If the buffer is not valid and we did not clear garbage
 304                  * out above, we have to do so here even though the write
 305                  * covers the entire buffer in order to avoid a mmap()/write
 306                  * race where another process may see the garbage prior to
 307                  * the uiomove() for a write replacing it.
 308                  */
 309                 if ((bp->b_flags & B_CACHE) == 0 && fs->e2fs_bsize <= xfersize)
 310                         vfs_bio_clrbuf(bp);
 311                 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
 312                         bp->b_flags |= B_NOCACHE;
 313                 if (uio->uio_offset + xfersize > ip->i_size)
 314                         ip->i_size = uio->uio_offset + xfersize;
 315                 size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
 316                 if (size < xfersize)
 317                         xfersize = size;
 318
 319                 error =
 320                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 321                 if (ioflag & (IO_VMIO|IO_DIRECT)) {
 322                         bp->b_flags |= B_RELBUF;
 323                 }
 324
 325                 /*
 326                  * If IO_SYNC each buffer is written synchronously.  Otherwise
 327                  * if we have a severe page deficiency write the buffer
 328                  * asynchronously.  Otherwise try to cluster, and if that
 329                  * doesn't do it then either do an async write (if O_DIRECT),
 330                  * or a delayed write (if not).
 331                  */
 332                 if (ioflag & IO_SYNC) {
 333                         (void)bwrite(bp);
 334                 } else if (vm_page_count_severe() ||
 335                     buf_dirty_count_severe() ||
 336                     (ioflag & IO_ASYNC)) {
 337                         bp->b_flags |= B_CLUSTEROK;
 338                         bawrite(bp);
 339                 } else if (xfersize + blkoffset == fs->e2fs_fsize) {
 340                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 341                                 bp->b_flags |= B_CLUSTEROK;
 342                                 cluster_write(vp, bp, ip->i_size, seqcount);
 343                         } else {
 344                                 bawrite(bp);
 345                         }
 346                 } else if (ioflag & IO_DIRECT) {
 347                         bp->b_flags |= B_CLUSTEROK;
 348                         bawrite(bp);
 349                 } else {
 350                         bp->b_flags |= B_CLUSTEROK;
 351                         bdwrite(bp);
 352                 }
 353                 if (error || xfersize == 0)
 354                         break;
 355         }
 356         /*
 357          * If we successfully wrote any data, and we are not the superuser
 358          * we clear the setuid and setgid bits as a precaution against
 359          * tampering.
 360          */
 361         if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
 362             ap->a_cred) {
 363                 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
 364                         ip->i_mode &= ~(ISUID | ISGID);
 365         }
 366         if (error) {
 367                 if (ioflag & IO_UNIT) {
 368                         (void)ext2_truncate(vp, osize,
 369                             ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
 370                         uio->uio_offset -= resid - uio->uio_resid;
 371                         uio->uio_resid = resid;
 372                 }
 373         }
 374         if (uio->uio_resid != resid) {
 375                ip->i_flag |= IN_CHANGE | IN_UPDATE;
 376                if (ioflag & IO_SYNC)
 377                        error = ext2_update(vp, 1);
 378        }
 379         return (error);
 380 }