From 89cd2197b94986d315b9b1be707b645baf59af4f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 23 Feb 2023 10:57:24 -0800 Subject: [PATCH] Fix buffered/direct/mmap I/O race When a page is faulted in for memory mapped I/O the page lock may be dropped before it has been read and marked up to date. If a buffered read encounters such a page in mappedread() it must wait until the page has been updated. Failure to do so will result in a panic on debug builds and incorrect data on production builds. The critical part of this change is in mappedread() where pages which are not up to date are now handled. Additionally, it includes the following simplifications. - zfs_getpage() and zfs_fillpage() could be passed an array of pages. This could be more efficient if it was used but in practice only a single page was ever provided. These interfaces were simplified to acknowledge that. - update_pages() was modified to correctly set the PG_error bit on a page when it cannot be read by dmu_read(). - Setting PG_error and PG_uptodate was moved to zfs_fillpage() from zpl_readpage_common(). This is consistent with the handling in update_pages() and mappedread(). - Minor additional refactoring to comments and variable declarations to improve readability. - Add a test case to exercise concurrent buffered, direct, and mmap IO to the same file. - Reduce the mmap_sync test case default run time. Reviewed-by: Richard Yao Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #13608 Closes #14498 --- include/os/linux/zfs/sys/zfs_vnops_os.h | 2 +- module/os/linux/zfs/zfs_vnops_os.c | 169 +++++++++--------- module/os/linux/zfs/zpl_file.c | 17 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/cmd/mmap_sync.c | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../tests/functional/mmap/mmap_mixed.ksh | 86 +++++++++ 7 files changed, 175 insertions(+), 105 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index 197ea9bec50..1caec0ef4f1 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -72,7 +72,7 @@ extern void zfs_inactive(struct inode *ip); extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr); extern int zfs_fid(struct inode *ip, fid_t *fidp); -extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); +extern int zfs_getpage(struct inode *ip, struct page *pp); extern int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, boolean_t for_sync); extern int zfs_dirty_inode(struct inode *ip, int flags); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 302a88c2d35..9904807faa9 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -220,43 +220,46 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) } #if defined(_KERNEL) + +static int zfs_fillpage(struct inode *ip, struct page *pp); + /* * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. + * between the DMU cache and the memory mapped pages. Update all mapped + * pages with the contents of the coresponding dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { - struct inode *ip = ZTOI(zp); - struct address_space *mp = ip->i_mapping; - struct page *pp; - uint64_t nbytes; - int64_t off; - void *pb; + struct address_space *mp = ZTOI(zp)->i_mapping; + int64_t off = start & (PAGE_SIZE - 1); - off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - nbytes = MIN(PAGE_SIZE - off, len); + uint64_t nbytes = MIN(PAGE_SIZE - off, len); - pp = find_lock_page(mp, start >> PAGE_SHIFT); + struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); - pb = kmap(pp); - (void) dmu_read(os, zp->z_id, start + off, nbytes, - pb + off, DMU_READ_PREFETCH); + void *pb = kmap(pp); + int error = dmu_read(os, zp->z_id, start + off, + nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); + if (error) { + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + } - mark_page_accessed(pp); - SetPageUptodate(pp); - ClearPageError(pp); unlock_page(pp); put_page(pp); } @@ -267,38 +270,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os) } /* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. + * When a file is memory mapped, we must keep the I/O data synchronized + * between the DMU cache and the memory mapped pages. Preferentially read + * from memory mapped pages, otherwise fallback to reading through the dmu. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; - struct page *pp; - int64_t start, off; - uint64_t bytes; + int64_t start = uio->uio_loffset; + int64_t off = start & (PAGE_SIZE - 1); int len = nbytes; int error = 0; - void *pb; - start = uio->uio_loffset; - off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - bytes = MIN(PAGE_SIZE - off, len); + uint64_t bytes = MIN(PAGE_SIZE - off, len); - pp = find_lock_page(mp, start >> PAGE_SHIFT); + struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { - ASSERT(PageUptodate(pp)); + /* + * If filemap_fault() retries there exists a window + * where the page will be unlocked and not up to date. + * In this case we must try and fill the page. + */ + if (unlikely(!PageUptodate(pp))) { + error = zfs_fillpage(ip, pp); + if (error) { + unlock_page(pp); + put_page(pp); + return (error); + } + } + + ASSERT(PageUptodate(pp) || PageDirty(pp)); + unlock_page(pp); - pb = kmap(pp); + void *pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); @@ -314,9 +323,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) len -= bytes; off = 0; + if (error) break; } + return (error); } #endif /* _KERNEL */ @@ -3959,55 +3970,41 @@ zfs_inactive(struct inode *ip) * Fill pages with data from the disk. */ static int -zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) +zfs_fillpage(struct inode *ip, struct page *pp) { - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - struct page *cur_pp; - u_offset_t io_off, total; - size_t io_len; - loff_t i_size; - unsigned page_idx; - int err; - - os = zfsvfs->z_os; - io_len = nr_pages << PAGE_SHIFT; - i_size = i_size_read(ip); - io_off = page_offset(pl[0]); + loff_t i_size = i_size_read(ip); + u_offset_t io_off = page_offset(pp); + size_t io_len = PAGE_SIZE; if (io_off + io_len > i_size) io_len = i_size - io_off; - /* - * Iterate over list of pages and read each page individually. - */ - page_idx = 0; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; - - cur_pp = pl[page_idx++]; - va = kmap(cur_pp); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - kunmap(cur_pp); - if (err) { - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } + void *va = kmap(pp); + int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + PAGE_SIZE, va, DMU_READ_PREFETCH); + kunmap(pp); + + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); } - return (0); + return (error); } /* - * Uses zfs_fillpage to read data from the file and fill the pages. + * Uses zfs_fillpage to read data from the file and fill the page. * * IN: ip - inode of file to get data from. - * pl - list of pages to read - * nr_pages - number of pages to read + * pp - page to read * * RETURN: 0 on success, error code on failure. * @@ -4015,24 +4012,22 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) * vp - atime updated */ int -zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) +zfs_getpage(struct inode *ip, struct page *pp) { - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - int err; - - if (pl == NULL) - return (0); - - if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) - return (err); + znode_t *zp = ITOZ(ip); + int error; - err = zfs_fillpage(ip, pl, nr_pages); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); + error = zfs_fillpage(ip, pp); + if (error == 0) + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); zfs_exit(zfsvfs, FTAG); - return (err); + + return (error); } /* diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index e42b15042a3..0a50f80ea68 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -657,29 +657,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) static inline int zpl_readpage_common(struct page *pp) { - struct inode *ip; - struct page *pl[1]; - int error = 0; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); - ip = pp->mapping->host; - pl[0] = pp; cookie = spl_fstrans_mark(); - error = -zfs_getpage(ip, pl, 1); + int error = -zfs_getpage(pp->mapping->host, pp); spl_fstrans_unmark(cookie); - if (error) { - SetPageError(pp); - ClearPageUptodate(pp); - } else { - ClearPageError(pp); - SetPageUptodate(pp); - flush_dcache_page(pp); - } - unlock_page(pp); + return (error); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 7a7cf927c77..ed6ec420558 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -687,7 +687,8 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', tags = ['functional', 'migration'] [tests/functional/mmap] -tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos'] +tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos', + 'mmap_sync_001_pos', 'mmap_write_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] diff --git a/tests/zfs-tests/cmd/mmap_sync.c b/tests/zfs-tests/cmd/mmap_sync.c index e4d190aef1c..226e71be2f5 100644 --- a/tests/zfs-tests/cmd/mmap_sync.c +++ b/tests/zfs-tests/cmd/mmap_sync.c @@ -59,7 +59,7 @@ main(int argc, char *argv[]) return (1); } - int run_time_mins = 5; + int run_time_mins = 1; if (argc >= 2) { run_time_mins = atoi(argv[1]); } diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index ad2ec467055..1a039742a61 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1491,6 +1491,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/migration/setup.ksh \ functional/mmap/cleanup.ksh \ functional/mmap/mmap_libaio_001_pos.ksh \ + functional/mmap/mmap_mixed.ksh \ functional/mmap/mmap_read_001_pos.ksh \ functional/mmap/mmap_seek_001_pos.ksh \ functional/mmap/mmap_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh new file mode 100755 index 00000000000..6c8246d48ac --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmap/mmap.cfg + +# +# DESCRIPTION: +# Verify mixed buffered and mmap IO. +# +# STRATEGY: +# 1. Create an empty file. +# 2. Start a background buffered read/write fio to the file. +# 3. Start a background mmap read/write fio to the file. +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$tmp_file" +} + +log_assert "Verify mixed buffered and mmap IO" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((128 * 1024)) +blocks=64 +size=$((bs * blocks)) +runtime=60 + +log_must dd if=/dev/zero of=$tmp_file bs=$bs count=$blocks + +# Buffered IO writes +log_must eval "fio --filename=$tmp_file --name=buffer-write \ + --rw=randwrite --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# Buffered IO reads +log_must eval "fio --filename=$tmp_file --name=buffer-read \ + --rw=randread --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap IO writes +log_must eval "fio --filename=$tmp_file --name=mmap-write \ + --rw=randwrite --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap IO reads +log_must eval "fio --filename=$tmp_file --name=mmap-read \ + --rw=randread --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +log_must wait + +log_pass "Verfied mixed buffered and mmap IO" -- 2.45.0