module/os/linux/zfs/zpl_file.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
  23  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  24  */
  25
  26
  27 #ifdef CONFIG_COMPAT
  28 #include <linux/compat.h>
  29 #endif
  30 #include <sys/file.h>
  31 #include <sys/dmu_objset.h>
  32 #include <sys/zfs_vfsops.h>
  33 #include <sys/zfs_vnops.h>
  34 #include <sys/zfs_znode.h>
  35 #include <sys/zfs_project.h>
  36
  37
  38 static int
  39 zpl_open(struct inode *ip, struct file *filp)
  40 {
  41         cred_t *cr = CRED();
  42         int error;
  43         fstrans_cookie_t cookie;
  44
  45         error = generic_file_open(ip, filp);
  46         if (error)
  47                 return (error);
  48
  49         crhold(cr);
  50         cookie = spl_fstrans_mark();
  51         error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
  52         spl_fstrans_unmark(cookie);
  53         crfree(cr);
  54         ASSERT3S(error, <=, 0);
  55
  56         return (error);
  57 }
  58
  59 static int
  60 zpl_release(struct inode *ip, struct file *filp)
  61 {
  62         cred_t *cr = CRED();
  63         int error;
  64         fstrans_cookie_t cookie;
  65
  66         cookie = spl_fstrans_mark();
  67         if (ITOZ(ip)->z_atime_dirty)
  68                 zfs_mark_inode_dirty(ip);
  69
  70         crhold(cr);
  71         error = -zfs_close(ip, filp->f_flags, cr);
  72         spl_fstrans_unmark(cookie);
  73         crfree(cr);
  74         ASSERT3S(error, <=, 0);
  75
  76         return (error);
  77 }
  78
  79 static int
  80 zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
  81 {
  82         cred_t *cr = CRED();
  83         int error;
  84         fstrans_cookie_t cookie;
  85
  86         crhold(cr);
  87         cookie = spl_fstrans_mark();
  88         error = -zfs_readdir(file_inode(filp), ctx, cr);
  89         spl_fstrans_unmark(cookie);
  90         crfree(cr);
  91         ASSERT3S(error, <=, 0);
  92
  93         return (error);
  94 }
  95
  96 #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
  97 static int
  98 zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
  99 {
 100         zpl_dir_context_t ctx =
 101             ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
 102         int error;
 103
 104         error = zpl_iterate(filp, &ctx);
 105         filp->f_pos = ctx.pos;
 106
 107         return (error);
 108 }
 109 #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
 110
 111 #if defined(HAVE_FSYNC_WITH_DENTRY)
 112 /*
 113  * Linux 2.6.x - 2.6.34 API,
 114  * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *'
 115  * to the fops->fsync() hook.  For this reason, we must be careful not to
 116  * use filp unconditionally.
 117  */
 118 static int
 119 zpl_fsync(struct file *filp, struct dentry *dentry, int datasync)
 120 {
 121         cred_t *cr = CRED();
 122         int error;
 123         fstrans_cookie_t cookie;
 124
 125         crhold(cr);
 126         cookie = spl_fstrans_mark();
 127         error = -zfs_fsync(dentry->d_inode, datasync, cr);
 128         spl_fstrans_unmark(cookie);
 129         crfree(cr);
 130         ASSERT3S(error, <=, 0);
 131
 132         return (error);
 133 }
 134
 135 #ifdef HAVE_FILE_AIO_FSYNC
 136 static int
 137 zpl_aio_fsync(struct kiocb *kiocb, int datasync)
 138 {
 139         struct file *filp = kiocb->ki_filp;
 140         return (zpl_fsync(filp, file_dentry(filp), datasync));
 141 }
 142 #endif
 143
 144 #elif defined(HAVE_FSYNC_WITHOUT_DENTRY)
 145 /*
 146  * Linux 2.6.35 - 3.0 API,
 147  * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
 148  * redundant.  The dentry is still accessible via filp->f_path.dentry,
 149  * and we are guaranteed that filp will never be NULL.
 150  */
 151 static int
 152 zpl_fsync(struct file *filp, int datasync)
 153 {
 154         struct inode *inode = filp->f_mapping->host;
 155         cred_t *cr = CRED();
 156         int error;
 157         fstrans_cookie_t cookie;
 158
 159         crhold(cr);
 160         cookie = spl_fstrans_mark();
 161         error = -zfs_fsync(inode, datasync, cr);
 162         spl_fstrans_unmark(cookie);
 163         crfree(cr);
 164         ASSERT3S(error, <=, 0);
 165
 166         return (error);
 167 }
 168
 169 #ifdef HAVE_FILE_AIO_FSYNC
 170 static int
 171 zpl_aio_fsync(struct kiocb *kiocb, int datasync)
 172 {
 173         return (zpl_fsync(kiocb->ki_filp, datasync));
 174 }
 175 #endif
 176
 177 #elif defined(HAVE_FSYNC_RANGE)
 178 /*
 179  * Linux 3.1 - 3.x API,
 180  * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
 181  * been pushed down in to the .fsync() vfs hook.  Additionally, the i_mutex
 182  * lock is no longer held by the caller, for zfs we don't require the lock
 183  * to be held so we don't acquire it.
 184  */
 185 static int
 186 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 187 {
 188         struct inode *inode = filp->f_mapping->host;
 189         cred_t *cr = CRED();
 190         int error;
 191         fstrans_cookie_t cookie;
 192
 193         error = filemap_write_and_wait_range(inode->i_mapping, start, end);
 194         if (error)
 195                 return (error);
 196
 197         crhold(cr);
 198         cookie = spl_fstrans_mark();
 199         error = -zfs_fsync(inode, datasync, cr);
 200         spl_fstrans_unmark(cookie);
 201         crfree(cr);
 202         ASSERT3S(error, <=, 0);
 203
 204         return (error);
 205 }
 206
 207 #ifdef HAVE_FILE_AIO_FSYNC
 208 static int
 209 zpl_aio_fsync(struct kiocb *kiocb, int datasync)
 210 {
 211         return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
 212 }
 213 #endif
 214
 215 #else
 216 #error "Unsupported fops->fsync() implementation"
 217 #endif
 218
 219 static inline int
 220 zfs_io_flags(struct kiocb *kiocb)
 221 {
 222         int flags = 0;
 223
 224 #if defined(IOCB_DSYNC)
 225         if (kiocb->ki_flags & IOCB_DSYNC)
 226                 flags |= FDSYNC;
 227 #endif
 228 #if defined(IOCB_SYNC)
 229         if (kiocb->ki_flags & IOCB_SYNC)
 230                 flags |= FSYNC;
 231 #endif
 232 #if defined(IOCB_APPEND)
 233         if (kiocb->ki_flags & IOCB_APPEND)
 234                 flags |= FAPPEND;
 235 #endif
 236 #if defined(IOCB_DIRECT)
 237         if (kiocb->ki_flags & IOCB_DIRECT)
 238                 flags |= FDIRECT;
 239 #endif
 240         return (flags);
 241 }
 242
 243 static ssize_t
 244 zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
 245     unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
 246     cred_t *cr, size_t skip)
 247 {
 248         ssize_t read;
 249         uio_t uio = { { 0 }, 0 };
 250         int error;
 251         fstrans_cookie_t cookie;
 252
 253         uio.uio_iov = iovp;
 254         uio.uio_iovcnt = nr_segs;
 255         uio.uio_loffset = *ppos;
 256         uio.uio_segflg = segment;
 257         uio.uio_limit = MAXOFFSET_T;
 258         uio.uio_resid = count;
 259         uio.uio_skip = skip;
 260
 261         cookie = spl_fstrans_mark();
 262         error = -zfs_read(ip, &uio, flags, cr);
 263         spl_fstrans_unmark(cookie);
 264         if (error < 0)
 265                 return (error);
 266
 267         read = count - uio.uio_resid;
 268         *ppos += read;
 269
 270         return (read);
 271 }
 272
 273 inline ssize_t
 274 zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
 275     uio_seg_t segment, int flags, cred_t *cr)
 276 {
 277         struct iovec iov;
 278
 279         iov.iov_base = (void *)buf;
 280         iov.iov_len = len;
 281
 282         return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
 283             flags, cr, 0));
 284 }
 285
 286 static ssize_t
 287 zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
 288     unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
 289 {
 290         cred_t *cr = CRED();
 291         struct file *filp = kiocb->ki_filp;
 292         struct inode *ip = filp->f_mapping->host;
 293         zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip));
 294         ssize_t read;
 295         unsigned int f_flags = filp->f_flags;
 296
 297         f_flags |= zfs_io_flags(kiocb);
 298         crhold(cr);
 299         read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
 300             nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
 301         crfree(cr);
 302
 303         /*
 304          * If relatime is enabled, call file_accessed() only if
 305          * zfs_relatime_need_update() is true.  This is needed since datasets
 306          * with inherited "relatime" property aren't necessarily mounted with
 307          * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what
 308          * relatime test in VFS by relatime_need_update() is based on.
 309          */
 310         if (!IS_NOATIME(ip) && zfsvfs->z_relatime) {
 311                 if (zfs_relatime_need_update(ip))
 312                         file_accessed(filp);
 313         } else {
 314                 file_accessed(filp);
 315         }
 316
 317         return (read);
 318 }
 319
 320 #if defined(HAVE_VFS_RW_ITERATE)
 321 static ssize_t
 322 zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
 323 {
 324         ssize_t ret;
 325         uio_seg_t seg = UIO_USERSPACE;
 326         if (to->type & ITER_KVEC)
 327                 seg = UIO_SYSSPACE;
 328         if (to->type & ITER_BVEC)
 329                 seg = UIO_BVEC;
 330         ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
 331             iov_iter_count(to), seg, to->iov_offset);
 332         if (ret > 0)
 333                 iov_iter_advance(to, ret);
 334         return (ret);
 335 }
 336 #else
 337 static ssize_t
 338 zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
 339     unsigned long nr_segs, loff_t pos)
 340 {
 341         ssize_t ret;
 342         size_t count;
 343
 344         ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE);
 345         if (ret)
 346                 return (ret);
 347
 348         return (zpl_iter_read_common(kiocb, iovp, nr_segs, count,
 349             UIO_USERSPACE, 0));
 350 }
 351 #endif /* HAVE_VFS_RW_ITERATE */
 352
 353 static ssize_t
 354 zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
 355     unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
 356     cred_t *cr, size_t skip)
 357 {
 358         ssize_t wrote;
 359         uio_t uio = { { 0 }, 0 };
 360         int error;
 361         fstrans_cookie_t cookie;
 362
 363         if (flags & O_APPEND)
 364                 *ppos = i_size_read(ip);
 365
 366         uio.uio_iov = iovp;
 367         uio.uio_iovcnt = nr_segs;
 368         uio.uio_loffset = *ppos;
 369         uio.uio_segflg = segment;
 370         uio.uio_limit = MAXOFFSET_T;
 371         uio.uio_resid = count;
 372         uio.uio_skip = skip;
 373
 374         cookie = spl_fstrans_mark();
 375         error = -zfs_write(ip, &uio, flags, cr);
 376         spl_fstrans_unmark(cookie);
 377         if (error < 0)
 378                 return (error);
 379
 380         wrote = count - uio.uio_resid;
 381         *ppos += wrote;
 382
 383         return (wrote);
 384 }
 385
 386 inline ssize_t
 387 zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
 388     uio_seg_t segment, int flags, cred_t *cr)
 389 {
 390         struct iovec iov;
 391
 392         iov.iov_base = (void *)buf;
 393         iov.iov_len = len;
 394
 395         return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
 396             flags, cr, 0));
 397 }
 398
 399 static ssize_t
 400 zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
 401     unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
 402 {
 403         cred_t *cr = CRED();
 404         struct file *filp = kiocb->ki_filp;
 405         ssize_t wrote;
 406         unsigned int f_flags = filp->f_flags;
 407
 408         f_flags |= zfs_io_flags(kiocb);
 409         crhold(cr);
 410         wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
 411             nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
 412         crfree(cr);
 413
 414         return (wrote);
 415 }
 416
 417 #if defined(HAVE_VFS_RW_ITERATE)
 418 static ssize_t
 419 zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
 420 {
 421         size_t count;
 422         ssize_t ret;
 423         uio_seg_t seg = UIO_USERSPACE;
 424
 425 #ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB
 426         struct file *file = kiocb->ki_filp;
 427         struct address_space *mapping = file->f_mapping;
 428         struct inode *ip = mapping->host;
 429         int isblk = S_ISBLK(ip->i_mode);
 430
 431         count = iov_iter_count(from);
 432         ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk);
 433         if (ret)
 434                 return (ret);
 435 #else
 436         /*
 437          * XXX - ideally this check should be in the same lock region with
 438          * write operations, so that there's no TOCTTOU race when doing
 439          * append and someone else grow the file.
 440          */
 441         ret = generic_write_checks(kiocb, from);
 442         if (ret <= 0)
 443                 return (ret);
 444         count = ret;
 445 #endif
 446
 447         if (from->type & ITER_KVEC)
 448                 seg = UIO_SYSSPACE;
 449         if (from->type & ITER_BVEC)
 450                 seg = UIO_BVEC;
 451
 452         ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
 453             count, seg, from->iov_offset);
 454         if (ret > 0)
 455                 iov_iter_advance(from, ret);
 456
 457         return (ret);
 458 }
 459 #else
 460 static ssize_t
 461 zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
 462     unsigned long nr_segs, loff_t pos)
 463 {
 464         struct file *file = kiocb->ki_filp;
 465         struct address_space *mapping = file->f_mapping;
 466         struct inode *ip = mapping->host;
 467         int isblk = S_ISBLK(ip->i_mode);
 468         size_t count;
 469         ssize_t ret;
 470
 471         ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ);
 472         if (ret)
 473                 return (ret);
 474
 475         ret = generic_write_checks(file, &pos, &count, isblk);
 476         if (ret)
 477                 return (ret);
 478
 479         return (zpl_iter_write_common(kiocb, iovp, nr_segs, count,
 480             UIO_USERSPACE, 0));
 481 }
 482 #endif /* HAVE_VFS_RW_ITERATE */
 483
 484 #if defined(HAVE_VFS_RW_ITERATE)
 485 static ssize_t
 486 zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
 487 {
 488         if (rw == WRITE)
 489                 return (zpl_iter_write(kiocb, iter));
 490         else
 491                 return (zpl_iter_read(kiocb, iter));
 492 }
 493 #if defined(HAVE_VFS_DIRECT_IO_ITER)
 494 static ssize_t
 495 zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
 496 {
 497         return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
 498 }
 499 #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
 500 static ssize_t
 501 zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
 502 {
 503         ASSERT3S(pos, ==, kiocb->ki_pos);
 504         return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
 505 }
 506 #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
 507 static ssize_t
 508 zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
 509 {
 510         ASSERT3S(pos, ==, kiocb->ki_pos);
 511         return (zpl_direct_IO_impl(rw, kiocb, iter));
 512 }
 513 #else
 514 #error "Unknown direct IO interface"
 515 #endif
 516
 517 #else
 518
 519 #if defined(HAVE_VFS_DIRECT_IO_IOVEC)
 520 static ssize_t
 521 zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp,
 522     loff_t pos, unsigned long nr_segs)
 523 {
 524         if (rw == WRITE)
 525                 return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
 526         else
 527                 return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
 528 }
 529 #else
 530 #error "Unknown direct IO interface"
 531 #endif
 532
 533 #endif /* HAVE_VFS_RW_ITERATE */
 534
 535 static loff_t
 536 zpl_llseek(struct file *filp, loff_t offset, int whence)
 537 {
 538 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
 539         fstrans_cookie_t cookie;
 540
 541         if (whence == SEEK_DATA || whence == SEEK_HOLE) {
 542                 struct inode *ip = filp->f_mapping->host;
 543                 loff_t maxbytes = ip->i_sb->s_maxbytes;
 544                 loff_t error;
 545
 546                 spl_inode_lock_shared(ip);
 547                 cookie = spl_fstrans_mark();
 548                 error = -zfs_holey(ip, whence, &offset);
 549                 spl_fstrans_unmark(cookie);
 550                 if (error == 0)
 551                         error = lseek_execute(filp, ip, offset, maxbytes);
 552                 spl_inode_unlock_shared(ip);
 553
 554                 return (error);
 555         }
 556 #endif /* SEEK_HOLE && SEEK_DATA */
 557
 558         return (generic_file_llseek(filp, offset, whence));
 559 }
 560
 561 /*
 562  * It's worth taking a moment to describe how mmap is implemented
 563  * for zfs because it differs considerably from other Linux filesystems.
 564  * However, this issue is handled the same way under OpenSolaris.
 565  *
 566  * The issue is that by design zfs bypasses the Linux page cache and
 567  * leaves all caching up to the ARC.  This has been shown to work
 568  * well for the common read(2)/write(2) case.  However, mmap(2)
  569  * is a problem because it relies on being tightly integrated with the
 570  * page cache.  To handle this we cache mmap'ed files twice, once in
 571  * the ARC and a second time in the page cache.  The code is careful
 572  * to keep both copies synchronized.
 573  *
 574  * When a file with an mmap'ed region is written to using write(2)
 575  * both the data in the ARC and existing pages in the page cache
 576  * are updated.  For a read(2) data will be read first from the page
  577  * cache then the ARC if needed.  Neither a write(2) nor a read(2) will
  578  * ever result in new pages being added to the page cache.
 579  *
 580  * New pages are added to the page cache only via .readpage() which
 581  * is called when the vfs needs to read a page off disk to back the
 582  * virtual memory region.  These pages may be modified without
 583  * notifying the ARC and will be written out periodically via
 584  * .writepage().  This will occur due to either a sync or the usual
 585  * page aging behavior.  Note because a read(2) of a mmap'ed file
 586  * will always check the page cache first even when the ARC is out
 587  * of date correct data will still be returned.
 588  *
  589  * While this implementation ensures correct behavior it does have
  590  * some drawbacks.  The most obvious of which is that it
  591  * increases the required memory footprint when accessing mmap'ed
 592  * files.  It also adds additional complexity to the code keeping
 593  * both caches synchronized.
 594  *
 595  * Longer term it may be possible to cleanly resolve this wart by
 596  * mapping page cache pages directly on to the ARC buffers.  The
 597  * Linux address space operations are flexible enough to allow
 598  * selection of which pages back a particular index.  The trick
 599  * would be working out the details of which subsystem is in
 600  * charge, the ARC, the page cache, or both.  It may also prove
  601  * helpful to move the ARC buffers to scatter-gather lists
 602  * rather than a vmalloc'ed region.
 603  */
 604 static int
 605 zpl_mmap(struct file *filp, struct vm_area_struct *vma)
 606 {
 607         struct inode *ip = filp->f_mapping->host;
 608         znode_t *zp = ITOZ(ip);
 609         int error;
 610         fstrans_cookie_t cookie;
 611
 612         cookie = spl_fstrans_mark();
 613         error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
 614             (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
 615         spl_fstrans_unmark(cookie);
 616         if (error)
 617                 return (error);
 618
 619         error = generic_file_mmap(filp, vma);
 620         if (error)
 621                 return (error);
 622
 623         mutex_enter(&zp->z_lock);
 624         zp->z_is_mapped = B_TRUE;
 625         mutex_exit(&zp->z_lock);
 626
 627         return (error);
 628 }
 629
 630 /*
 631  * Populate a page with data for the Linux page cache.  This function is
 632  * only used to support mmap(2).  There will be an identical copy of the
 633  * data in the ARC which is kept up to date via .write() and .writepage().
 634  *
 635  * Current this function relies on zpl_read_common() and the O_DIRECT
 636  * flag to read in a page.  This works but the more correct way is to
 637  * update zfs_fillpage() to be Linux friendly and use that interface.
 638  */
 639 static int
 640 zpl_readpage(struct file *filp, struct page *pp)
 641 {
 642         struct inode *ip;
 643         struct page *pl[1];
 644         int error = 0;
 645         fstrans_cookie_t cookie;
 646
 647         ASSERT(PageLocked(pp));
 648         ip = pp->mapping->host;
 649         pl[0] = pp;
 650
 651         cookie = spl_fstrans_mark();
 652         error = -zfs_getpage(ip, pl, 1);
 653         spl_fstrans_unmark(cookie);
 654
 655         if (error) {
 656                 SetPageError(pp);
 657                 ClearPageUptodate(pp);
 658         } else {
 659                 ClearPageError(pp);
 660                 SetPageUptodate(pp);
 661                 flush_dcache_page(pp);
 662         }
 663
 664         unlock_page(pp);
 665         return (error);
 666 }
 667
 668 /*
 669  * Populate a set of pages with data for the Linux page cache.  This
 670  * function will only be called for read ahead and never for demand
 671  * paging.  For simplicity, the code relies on read_cache_pages() to
 672  * correctly lock each page for IO and call zpl_readpage().
 673  */
 674 static int
 675 zpl_readpages(struct file *filp, struct address_space *mapping,
 676     struct list_head *pages, unsigned nr_pages)
 677 {
 678         return (read_cache_pages(mapping, pages,
 679             (filler_t *)zpl_readpage, filp));
 680 }
 681
 682 int
 683 zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 684 {
 685         struct address_space *mapping = data;
 686         fstrans_cookie_t cookie;
 687
 688         ASSERT(PageLocked(pp));
 689         ASSERT(!PageWriteback(pp));
 690
 691         cookie = spl_fstrans_mark();
 692         (void) zfs_putpage(mapping->host, pp, wbc);
 693         spl_fstrans_unmark(cookie);
 694
 695         return (0);
 696 }
 697
 698 static int
 699 zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 700 {
 701         znode_t         *zp = ITOZ(mapping->host);
 702         zfsvfs_t        *zfsvfs = ITOZSB(mapping->host);
 703         enum writeback_sync_modes sync_mode;
 704         int result;
 705
 706         ZFS_ENTER(zfsvfs);
 707         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 708                 wbc->sync_mode = WB_SYNC_ALL;
 709         ZFS_EXIT(zfsvfs);
 710         sync_mode = wbc->sync_mode;
 711
 712         /*
 713          * We don't want to run write_cache_pages() in SYNC mode here, because
 714          * that would make putpage() wait for a single page to be committed to
 715          * disk every single time, resulting in atrocious performance. Instead
 716          * we run it once in non-SYNC mode so that the ZIL gets all the data,
 717          * and then we commit it all in one go.
 718          */
 719         wbc->sync_mode = WB_SYNC_NONE;
 720         result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
 721         if (sync_mode != wbc->sync_mode) {
 722                 ZFS_ENTER(zfsvfs);
 723                 ZFS_VERIFY_ZP(zp);
 724                 if (zfsvfs->z_log != NULL)
 725                         zil_commit(zfsvfs->z_log, zp->z_id);
 726                 ZFS_EXIT(zfsvfs);
 727
 728                 /*
 729                  * We need to call write_cache_pages() again (we can't just
 730                  * return after the commit) because the previous call in
 731                  * non-SYNC mode does not guarantee that we got all the dirty
 732                  * pages (see the implementation of write_cache_pages() for
 733                  * details). That being said, this is a no-op in most cases.
 734                  */
 735                 wbc->sync_mode = sync_mode;
 736                 result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
 737         }
 738         return (result);
 739 }
 740
 741 /*
 742  * Write out dirty pages to the ARC, this function is only required to
 743  * support mmap(2).  Mapped pages may be dirtied by memory operations
 744  * which never call .write().  These dirty pages are kept in sync with
 745  * the ARC buffers via this hook.
 746  */
 747 static int
 748 zpl_writepage(struct page *pp, struct writeback_control *wbc)
 749 {
 750         if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
 751                 wbc->sync_mode = WB_SYNC_ALL;
 752
 753         return (zpl_putpage(pp, wbc, pp->mapping));
 754 }
 755
 756 /*
 757  * The only flag combination which matches the behavior of zfs_space()
 758  * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE.  The FALLOC_FL_PUNCH_HOLE
 759  * flag was introduced in the 2.6.38 kernel.
 760  */
 761 #if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE)
 762 long
 763 zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
 764 {
 765         int error = -EOPNOTSUPP;
 766
 767 #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
 768         cred_t *cr = CRED();
 769         flock64_t bf;
 770         loff_t olen;
 771         fstrans_cookie_t cookie;
 772
 773         if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 774                 return (error);
 775
 776         if (offset < 0 || len <= 0)
 777                 return (-EINVAL);
 778
 779         spl_inode_lock(ip);
 780         olen = i_size_read(ip);
 781
 782         if (offset > olen) {
 783                 spl_inode_unlock(ip);
 784                 return (0);
 785         }
 786         if (offset + len > olen)
 787                 len = olen - offset;
 788         bf.l_type = F_WRLCK;
 789         bf.l_whence = SEEK_SET;
 790         bf.l_start = offset;
 791         bf.l_len = len;
 792         bf.l_pid = 0;
 793
 794         crhold(cr);
 795         cookie = spl_fstrans_mark();
 796         error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr);
 797         spl_fstrans_unmark(cookie);
 798         spl_inode_unlock(ip);
 799
 800         crfree(cr);
 801 #endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */
 802
 803         ASSERT3S(error, <=, 0);
 804         return (error);
 805 }
 806 #endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */
 807
 808 #ifdef HAVE_FILE_FALLOCATE
 809 static long
 810 zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
 811 {
 812         return zpl_fallocate_common(file_inode(filp),
 813             mode, offset, len);
 814 }
 815 #endif /* HAVE_FILE_FALLOCATE */
 816
 817 #define ZFS_FL_USER_VISIBLE     (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
 818 #define ZFS_FL_USER_MODIFIABLE  (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
 819
 820 static uint32_t
 821 __zpl_ioctl_getflags(struct inode *ip)
 822 {
 823         uint64_t zfs_flags = ITOZ(ip)->z_pflags;
 824         uint32_t ioctl_flags = 0;
 825
 826         if (zfs_flags & ZFS_IMMUTABLE)
 827                 ioctl_flags |= FS_IMMUTABLE_FL;
 828
 829         if (zfs_flags & ZFS_APPENDONLY)
 830                 ioctl_flags |= FS_APPEND_FL;
 831
 832         if (zfs_flags & ZFS_NODUMP)
 833                 ioctl_flags |= FS_NODUMP_FL;
 834
 835         if (zfs_flags & ZFS_PROJINHERIT)
 836                 ioctl_flags |= ZFS_PROJINHERIT_FL;
 837
 838         return (ioctl_flags & ZFS_FL_USER_VISIBLE);
 839 }
 840
 841 /*
 842  * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
 843  * attributes common to both Linux and Solaris are mapped.
 844  */
 845 static int
 846 zpl_ioctl_getflags(struct file *filp, void __user *arg)
 847 {
 848         uint32_t flags;
 849         int err;
 850
 851         flags = __zpl_ioctl_getflags(file_inode(filp));
 852         err = copy_to_user(arg, &flags, sizeof (flags));
 853
 854         return (err);
 855 }
 856
 857 /*
 858  * fchange() is a helper macro to detect if we have been asked to change a
 859  * flag. This is ugly, but the requirement that we do this is a consequence of
 860  * how the Linux file attribute interface was designed. Another consequence is
 861  * that concurrent modification of files suffers from a TOCTOU race. Neither
 862  * are things we can fix without modifying the kernel-userland interface, which
 863  * is outside of our jurisdiction.
 864  */
 865
 866 #define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
 867
 868 static int
 869 __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
 870 {
 871         uint64_t zfs_flags = ITOZ(ip)->z_pflags;
 872         xoptattr_t *xoap;
 873
 874         if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
 875             ZFS_PROJINHERIT_FL))
 876                 return (-EOPNOTSUPP);
 877
 878         if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
 879                 return (-EACCES);
 880
 881         if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
 882             fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
 883             !capable(CAP_LINUX_IMMUTABLE))
 884                 return (-EACCES);
 885
 886         if (!zpl_inode_owner_or_capable(ip))
 887                 return (-EACCES);
 888
 889         xva_init(xva);
 890         xoap = xva_getxoptattr(xva);
 891
 892         XVA_SET_REQ(xva, XAT_IMMUTABLE);
 893         if (ioctl_flags & FS_IMMUTABLE_FL)
 894                 xoap->xoa_immutable = B_TRUE;
 895
 896         XVA_SET_REQ(xva, XAT_APPENDONLY);
 897         if (ioctl_flags & FS_APPEND_FL)
 898                 xoap->xoa_appendonly = B_TRUE;
 899
 900         XVA_SET_REQ(xva, XAT_NODUMP);
 901         if (ioctl_flags & FS_NODUMP_FL)
 902                 xoap->xoa_nodump = B_TRUE;
 903
 904         XVA_SET_REQ(xva, XAT_PROJINHERIT);
 905         if (ioctl_flags & ZFS_PROJINHERIT_FL)
 906                 xoap->xoa_projinherit = B_TRUE;
 907
 908         return (0);
 909 }
 910
 911 static int
 912 zpl_ioctl_setflags(struct file *filp, void __user *arg)
 913 {
 914         struct inode *ip = file_inode(filp);
 915         uint32_t flags;
 916         cred_t *cr = CRED();
 917         xvattr_t xva;
 918         int err;
 919         fstrans_cookie_t cookie;
 920
 921         if (copy_from_user(&flags, arg, sizeof (flags)))
 922                 return (-EFAULT);
 923
 924         err = __zpl_ioctl_setflags(ip, flags, &xva);
 925         if (err)
 926                 return (err);
 927
 928         crhold(cr);
 929         cookie = spl_fstrans_mark();
 930         err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr);
 931         spl_fstrans_unmark(cookie);
 932         crfree(cr);
 933
 934         return (err);
 935 }
 936
 937 static int
 938 zpl_ioctl_getxattr(struct file *filp, void __user *arg)
 939 {
 940         zfsxattr_t fsx = { 0 };
 941         struct inode *ip = file_inode(filp);
 942         int err;
 943
 944         fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
 945         fsx.fsx_projid = ITOZ(ip)->z_projid;
 946         err = copy_to_user(arg, &fsx, sizeof (fsx));
 947
 948         return (err);
 949 }
 950
 951 static int
 952 zpl_ioctl_setxattr(struct file *filp, void __user *arg)
 953 {
 954         struct inode *ip = file_inode(filp);
 955         zfsxattr_t fsx;
 956         cred_t *cr = CRED();
 957         xvattr_t xva;
 958         xoptattr_t *xoap;
 959         int err;
 960         fstrans_cookie_t cookie;
 961
 962         if (copy_from_user(&fsx, arg, sizeof (fsx)))
 963                 return (-EFAULT);
 964
 965         if (!zpl_is_valid_projid(fsx.fsx_projid))
 966                 return (-EINVAL);
 967
 968         err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
 969         if (err)
 970                 return (err);
 971
 972         xoap = xva_getxoptattr(&xva);
 973         XVA_SET_REQ(&xva, XAT_PROJID);
 974         xoap->xoa_projid = fsx.fsx_projid;
 975
 976         crhold(cr);
 977         cookie = spl_fstrans_mark();
 978         err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr);
 979         spl_fstrans_unmark(cookie);
 980         crfree(cr);
 981
 982         return (err);
 983 }
 984
 985 static long
 986 zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 987 {
 988         switch (cmd) {
 989         case FS_IOC_GETFLAGS:
 990                 return (zpl_ioctl_getflags(filp, (void *)arg));
 991         case FS_IOC_SETFLAGS:
 992                 return (zpl_ioctl_setflags(filp, (void *)arg));
 993         case ZFS_IOC_FSGETXATTR:
 994                 return (zpl_ioctl_getxattr(filp, (void *)arg));
 995         case ZFS_IOC_FSSETXATTR:
 996                 return (zpl_ioctl_setxattr(filp, (void *)arg));
 997         default:
 998                 return (-ENOTTY);
 999         }
1000 }
1001
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: map FS_IOC32_GETFLAGS and FS_IOC32_SETFLAGS
 * to FS_IOC_GETFLAGS and FS_IOC_SETFLAGS, then forward to zpl_ioctl()
 * with `arg` converted through compat_ptr().  Every other command
 * returns -ENOTTY.
 */
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        unsigned int native_cmd;

        if (cmd == FS_IOC32_GETFLAGS)
                native_cmd = FS_IOC_GETFLAGS;
        else if (cmd == FS_IOC32_SETFLAGS)
                native_cmd = FS_IOC_SETFLAGS;
        else
                return (-ENOTTY);

        return (zpl_ioctl(filp, native_cmd,
            (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */
1019
1020
1021 const struct address_space_operations zpl_address_space_operations = {
1022         .readpages      = zpl_readpages,
1023         .readpage       = zpl_readpage,
1024         .writepage      = zpl_writepage,
1025         .writepages     = zpl_writepages,
1026         .direct_IO      = zpl_direct_IO,
1027 };
1028
1029 const struct file_operations zpl_file_operations = {
1030         .open           = zpl_open,
1031         .release        = zpl_release,
1032         .llseek         = zpl_llseek,
1033 #ifdef HAVE_VFS_RW_ITERATE
1034 #ifdef HAVE_NEW_SYNC_READ
1035         .read           = new_sync_read,
1036         .write          = new_sync_write,
1037 #endif
1038         .read_iter      = zpl_iter_read,
1039         .write_iter     = zpl_iter_write,
1040 #else
1041         .read           = do_sync_read,
1042         .write          = do_sync_write,
1043         .aio_read       = zpl_aio_read,
1044         .aio_write      = zpl_aio_write,
1045 #endif
1046         .mmap           = zpl_mmap,
1047         .fsync          = zpl_fsync,
1048 #ifdef HAVE_FILE_AIO_FSYNC
1049         .aio_fsync      = zpl_aio_fsync,
1050 #endif
1051 #ifdef HAVE_FILE_FALLOCATE
1052         .fallocate      = zpl_fallocate,
1053 #endif /* HAVE_FILE_FALLOCATE */
1054         .unlocked_ioctl = zpl_ioctl,
1055 #ifdef CONFIG_COMPAT
1056         .compat_ioctl   = zpl_compat_ioctl,
1057 #endif
1058 };
1059
1060 const struct file_operations zpl_dir_file_operations = {
1061         .llseek         = generic_file_llseek,
1062         .read           = generic_read_dir,
1063 #if defined(HAVE_VFS_ITERATE_SHARED)
1064         .iterate_shared = zpl_iterate,
1065 #elif defined(HAVE_VFS_ITERATE)
1066         .iterate        = zpl_iterate,
1067 #else
1068         .readdir        = zpl_readdir,
1069 #endif
1070         .fsync          = zpl_fsync,
1071         .unlocked_ioctl = zpl_ioctl,
1072 #ifdef CONFIG_COMPAT
1073         .compat_ioctl   = zpl_compat_ioctl,
1074 #endif
1075 };