From d6a303386fe79187c5e7a22efe2bede77fd043ed Mon Sep 17 00:00:00 2001 From: asomers Date: Tue, 11 Jun 2019 16:32:33 +0000 Subject: [PATCH] fusefs: WIP fixing writeback caching The current "writeback" cache mode, selected by the vfs.fusefs.data_cache_mode sysctl, doesn't do writeback caching at all. It merely goes through the motions of using buf(9), but then writes every buffer synchronously. This commit: * Enables delayed writes when the sysctl is set to writeback caching * Fixes a cache-coherency problem when extending a file whose last page has just been written. * Removes the "sync" mount option, which had been set unconditionally. * Adjusts some SDT probes * Adds several new tests that mimic what fsx does but with more control and without a real file system. As I discover failures with fsx, I add regression tests to this file. * Adds a test that ensures we can append to a file without reading any data from it. This change is still incomplete. Clustered writing is not yet supported, and there are frequent "panic: vm_fault_hold: fault on nofault entry" panics that I need to fix. 
Sponsored by: The FreeBSD Foundation --- sys/fs/fuse/fuse_io.c | 71 +++++++-- sys/fs/fuse/fuse_vfsops.c | 3 - tests/sys/fs/fusefs/Makefile | 1 + tests/sys/fs/fusefs/io.cc | 276 +++++++++++++++++++++++++++++++++++ tests/sys/fs/fusefs/write.cc | 40 ++++- 5 files changed, 374 insertions(+), 17 deletions(-) create mode 100644 tests/sys/fs/fusefs/io.cc diff --git a/sys/fs/fuse/fuse_io.c b/sys/fs/fuse/fuse_io.c index 2b678643426..da67ad5c9ab 100644 --- a/sys/fs/fuse/fuse_io.c +++ b/sys/fs/fuse/fuse_io.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -265,9 +266,10 @@ fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, bool pages, return (err); } -SDT_PROBE_DEFINE3(fusefs, , io, read_bio_backend_start, "int", "int", "int"); +SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int"); SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "int"); -SDT_PROBE_DEFINE3(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int"); +SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int", + "struct buf*"); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid) @@ -297,9 +299,6 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); - SDT_PROBE3(fusefs, , io, read_bio_backend_start, - biosize, (int)lbn, on); - if ((off_t)lbn * biosize >= filesize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { @@ -308,6 +307,9 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, bcount = biosize; } + SDT_PROBE4(fusefs, , io, read_bio_backend_start, + biosize, (int)lbn, on, bcount); + /* TODO: readahead. 
See ext2_read for an example */ err = bread(vp, lbn, bcount, NOCRED, &bp); if (err) { @@ -333,8 +335,8 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, err = uiomove(bp->b_data + on, n, uio); } vfs_bio_brelse(bp, ioflag); - SDT_PROBE3(fusefs, , io, read_bio_backend_end, err, - uio->uio_resid, n); + SDT_PROBE4(fusefs, , io, read_bio_backend_end, err, + uio->uio_resid, n, bp); } return (err); @@ -564,6 +566,7 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio, off_t filesize; int bcount; int n, on, err = 0; + bool last_page; const int biosize = fuse_iosize(vp); @@ -612,6 +615,11 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio, extending = true; bcount = on + n; } + if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >= + howmany(filesize, PAGE_SIZE)) + last_page = true; + else + last_page = false; if (direct_append) { /* * Take care to preserve the buffer's B_CACHE state so @@ -642,6 +650,8 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio, break; } } + if (biosize > bcount) + vfs_bio_clrbuf(bp); SDT_PROBE6(fusefs, , io, write_biobackend_start, lbn, on, n, uio, bcount, direct_append); @@ -689,7 +699,6 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio, * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ - if (bp->b_dirtyend > bcount) { SDT_PROBE2(fusefs, , io, write_biobackend_append_race, (long)bp->b_blkno * biosize, @@ -738,6 +747,7 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio, bp->b_error = err; brelse(bp); break; + /* TODO: vfs_bio_clrbuf like ffs_write does? */ } /* * Only update dirtyoff/dirtyend if not a degenerate @@ -753,7 +763,42 @@ fuse_write_biobackend(struct vnode *vp, struct uio *uio, } vfs_bio_set_valid(bp, on, n); } - err = bwrite(bp); + + vfs_bio_set_flags(bp, ioflag); + + if (last_page) { + /* + * When writing the last page of a file we must write + * synchronously. 
If we didn't, then a subsequent + * operation could extend the file, making the last + * page of this buffer invalid because it would only be + * partially cached. + * + * As an optimization, it would be allowable to only + * write the last page synchronously. Or, it should be + * possible to synchronously flush the last + * already-written page whenever extending a file with + * ftruncate or another write. + */ + err = bwrite(bp); + } else if (ioflag & IO_SYNC) { + err = bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + (ioflag & IO_ASYNC)) { + /* TODO: enable write clustering later */ + bawrite(bp); + } else if (on == 0 && n == bcount) { + if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) + bdwrite(bp); + else + bawrite(bp); + } else if (ioflag & IO_DIRECT) { + bawrite(bp); + } else { + bp->b_flags &= ~B_CLUSTEROK; + bdwrite(bp); + } if (err) break; } while (uio->uio_resid > 0 && n > 0); @@ -819,7 +864,7 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp) io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; - uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; + uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize; error = fuse_read_directbackend(vp, uiop, cred, fufh); if (!error && uiop->uio_resid) { @@ -854,14 +899,14 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp) return (error); } - if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > filesize) + if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize) bp->b_dirtyend = filesize - - (off_t)bp->b_blkno * biosize; + (off_t)bp->b_lblkno * biosize; if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; - uiop->uio_offset = (off_t)bp->b_blkno * biosize + uiop->uio_offset = (off_t)bp->b_lblkno * biosize + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; diff --git a/sys/fs/fuse/fuse_vfsops.c b/sys/fs/fuse/fuse_vfsops.c index 8b25234ded4..6a5adac8720 100644 --- 
a/sys/fs/fuse/fuse_vfsops.c +++ b/sys/fs/fuse/fuse_vfsops.c @@ -314,9 +314,6 @@ fuse_vfsop_mount(struct mount *mp) __mntopts = 0; td = curthread; - MNT_ILOCK(mp); - mp->mnt_flag |= MNT_SYNCHRONOUS; - MNT_IUNLOCK(mp); /* Get the new options passed to mount */ opts = mp->mnt_optnew; diff --git a/tests/sys/fs/fusefs/Makefile b/tests/sys/fs/fusefs/Makefile index 2e4f6cb6ed4..8aa07933f60 100644 --- a/tests/sys/fs/fusefs/Makefile +++ b/tests/sys/fs/fusefs/Makefile @@ -21,6 +21,7 @@ GTESTS+= fsync GTESTS+= fsyncdir GTESTS+= getattr GTESTS+= interrupt +GTESTS+= io GTESTS+= link GTESTS+= locks GTESTS+= lookup diff --git a/tests/sys/fs/fusefs/io.cc b/tests/sys/fs/fusefs/io.cc new file mode 100644 index 00000000000..3a04626f0f5 --- /dev/null +++ b/tests/sys/fs/fusefs/io.cc @@ -0,0 +1,276 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 The FreeBSD Foundation + * + * This software was developed by BFF Storage Systems, LLC under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +extern "C" { +#include +#include +#include +} + +#include "mockfs.hh" +#include "utils.hh" + +/* + * For testing I/O like fsx does, but deterministically and without a real + * underlying file system + * + * TODO: after fusefs gains the options to select cache mode for each mount + * point, run each of these tests for all cache modes. + */ + +using namespace testing; + +const char FULLPATH[] = "mountpoint/some_file.txt"; +const char RELPATH[] = "some_file.txt"; +const uint64_t ino = 42; + +class Io: public FuseTest { +public: +int m_backing_fd, m_control_fd, m_test_fd; + +Io(): m_backing_fd(-1), m_control_fd(-1) {}; + +void SetUp() +{ + m_backing_fd = open("backing_file", O_RDWR | O_CREAT | O_TRUNC); + if (m_backing_fd < 0) + FAIL() << strerror(errno); + m_control_fd = open("control", O_RDWR | O_CREAT | O_TRUNC); + if (m_control_fd < 0) + FAIL() << strerror(errno); + srandom(22'9'1982); // Seed with my birthday + FuseTest::SetUp(); + if (IsSkipped()) + return; + + expect_lookup(RELPATH, ino, S_IFREG | 0644, 0, 1); + expect_open(ino, 0, 1); + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + return (in.header.opcode == FUSE_WRITE && + in.header.nodeid == ino); + }, Eq(true)), + _) + ).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) { + const char *buf = (const char*)in.body.bytes + + sizeof(struct fuse_write_in); + ssize_t isize = in.body.write.size; + off_t iofs = in.body.write.offset; + + ASSERT_EQ(isize, 
pwrite(m_backing_fd, buf, isize, iofs)) + << strerror(errno); + SET_OUT_HEADER_LEN(out, write); + out.body.write.size = isize; + }))); + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + return (in.header.opcode == FUSE_READ && + in.header.nodeid == ino); + }, Eq(true)), + _) + ).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) { + ssize_t isize = in.body.write.size; + off_t iofs = in.body.write.offset; + void *buf = out.body.bytes; + + ASSERT_LE(0, pread(m_backing_fd, buf, isize, iofs)) + << strerror(errno); + out.header.len = sizeof(struct fuse_out_header) + isize; + }))); + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + uint32_t valid = FATTR_SIZE | FATTR_FH; + return (in.header.opcode == FUSE_SETATTR && + in.header.nodeid == ino && + in.body.setattr.valid == valid); + }, Eq(true)), + _) + ).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) { + ASSERT_EQ(0, ftruncate(m_backing_fd, in.body.setattr.size)) + << strerror(errno); + SET_OUT_HEADER_LEN(out, attr); + out.body.attr.attr.ino = ino; + out.body.attr.attr.mode = S_IFREG | 0755; + out.body.attr.attr.size = in.body.setattr.size; + out.body.attr.attr_valid = UINT64_MAX; + }))); + /* Any test that close()s will send FUSE_FLUSH and FUSE_RELEASE */ + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + return (in.header.opcode == FUSE_FLUSH && + in.header.nodeid == ino); + }, Eq(true)), + _) + ).WillRepeatedly(Invoke(ReturnErrno(0))); + EXPECT_CALL(*m_mock, process( + ResultOf([=](auto in) { + return (in.header.opcode == FUSE_RELEASE && + in.header.nodeid == ino); + }, Eq(true)), + _) + ).WillRepeatedly(Invoke(ReturnErrno(0))); + + m_test_fd = open(FULLPATH, O_RDWR ); + EXPECT_LE(0, m_test_fd) << strerror(errno); +} + +void TearDown() +{ + if (m_backing_fd >= 0) + close(m_backing_fd); + if (m_control_fd >= 0) + close(m_control_fd); + FuseTest::TearDown(); + /* Deliberately leak test_fd */ +} + +void do_ftruncate(off_t offs) +{ + ASSERT_EQ(0, ftruncate(m_test_fd, 
offs)) << strerror(errno); + ASSERT_EQ(0, ftruncate(m_control_fd, offs)) << strerror(errno); +} + +void do_read(ssize_t size, off_t offs) +{ + void *test_buf, *control_buf; + + test_buf = malloc(size); + ASSERT_NE(NULL, test_buf) << strerror(errno); + control_buf = malloc(size); + ASSERT_NE(NULL, control_buf) << strerror(errno); + + ASSERT_EQ(size, pread(m_test_fd, test_buf, size, offs)) + << strerror(errno); + ASSERT_EQ(size, pread(m_control_fd, control_buf, size, offs)) + << strerror(errno); + + ASSERT_EQ(0, memcmp(test_buf, control_buf, size)); + + free(control_buf); + free(test_buf); +} + +void do_write(ssize_t size, off_t offs) +{ + char *buf; + long i; + + buf = (char*)malloc(size); + ASSERT_NE(NULL, buf) << strerror(errno); + for (i=0; i < size; i++) + buf[i] = random(); + + ASSERT_EQ(size, pwrite(m_test_fd, buf, size, offs )) + << strerror(errno); + ASSERT_EQ(size, pwrite(m_control_fd, buf, size, offs)) + << strerror(errno); +} + +}; + +/* + * Extend a file with dirty data in the last page of the last block. + * + * fsx -WR -P /tmp -S8 -N3 fsx.bin + */ +TEST_F(Io, extend_from_dirty_page) +{ + off_t wofs = 0x21a0; + ssize_t wsize = 0xf0a8; + off_t rofs = 0xb284; + ssize_t rsize = 0x9b22; + off_t truncsize = 0x28702; + + do_write(wsize, wofs); + do_ftruncate(truncsize); + do_read(rsize, rofs); +} + +/* + * When writing the last page of a file, it must be written synchronously. + * Otherwise the cached page can become invalid by a subsequent extend + * operation. + * + * fsx -WR -P /tmp -S642 -N3 fsx.bin + */ +TEST_F(Io, last_page) +{ + off_t wofs0 = 0x1134f; + ssize_t wsize0 = 0xcc77; + off_t wofs1 = 0x2096a; + ssize_t wsize1 = 0xdfa7; + off_t rofs = 0x1a3aa; + ssize_t rsize = 0xb5b7; + + do_write(wsize0, wofs0); + do_write(wsize1, wofs1); + do_read(rsize, rofs); +} + +/* + * Read a hole from a block that contains some cached data. 
+ * + * fsx -WR -P /tmp -S55 fsx.bin + */ +TEST_F(Io, read_hole_from_cached_block) +{ + off_t wofs = 0x160c5; + ssize_t wsize = 0xa996; + off_t rofs = 0x472e; + ssize_t rsize = 0xd8d5; + + do_write(wsize, wofs); + do_read(rsize, rofs); +} + +/* + * Reliable panic; I don't yet know why. + * Disabled because it panics. + * + * fsx -WR -P /tmp -S839 -d -N6 fsx.bin + */ +TEST_F(Io, DISABLED_fault_on_nofault_entry) +{ + off_t wofs0 = 0x3bad7; + ssize_t wsize0 = 0x4529; + off_t wofs1 = 0xc30d; + ssize_t wsize1 = 0x5f77; + off_t truncsize0 = 0x10916; + off_t rofs = 0xdf17; + ssize_t rsize = 0x29ff; + off_t truncsize1 = 0x152b4; + + do_write(wsize0, wofs0); + do_write(wsize1, wofs1); + do_ftruncate(truncsize0); + do_read(rsize, rofs); + do_ftruncate(truncsize1); + close(m_test_fd); +} diff --git a/tests/sys/fs/fusefs/write.cc b/tests/sys/fs/fusefs/write.cc index 242f442c6a9..31b7f90a585 100644 --- a/tests/sys/fs/fusefs/write.cc +++ b/tests/sys/fs/fusefs/write.cc @@ -249,6 +249,44 @@ TEST_F(Write, append) /* Deliberately leak fd. 
close(2) will be tested in release.cc */ } +/* If a file is cached, then appending to the end should not cause a read */ +TEST_F(Write, append_to_cached) +{ + const ssize_t BUFSIZE = 9; + const char FULLPATH[] = "mountpoint/some_file.txt"; + const char RELPATH[] = "some_file.txt"; + char *oldcontents, *oldbuf; + const char CONTENTS[BUFSIZE] = "abcdefgh"; + uint64_t ino = 42; + /* + * Set offset in between maxbcachebuf boundary to test buffer handling + */ + uint64_t oldsize = m_maxbcachebuf / 2; + int fd; + + oldcontents = (char*)calloc(1, oldsize); + ASSERT_NE(NULL, oldcontents) << strerror(errno); + oldbuf = (char*)malloc(oldsize); + ASSERT_NE(NULL, oldbuf) << strerror(errno); + + expect_lookup(RELPATH, ino, oldsize); + expect_open(ino, 0, 1); + expect_read(ino, 0, oldsize, oldsize, oldcontents); + expect_write(ino, oldsize, BUFSIZE, BUFSIZE, CONTENTS); + + /* Must open O_RDWR or fuse(4) implicitly sets direct_io */ + fd = open(FULLPATH, O_RDWR | O_APPEND); + EXPECT_LE(0, fd) << strerror(errno); + + /* Read the old data into the cache */ + ASSERT_EQ((ssize_t)oldsize, read(fd, oldbuf, oldsize)) + << strerror(errno); + + /* Write the new data. There should be no more read operations */ + ASSERT_EQ(BUFSIZE, write(fd, CONTENTS, BUFSIZE)) << strerror(errno); + /* Deliberately leak fd. close(2) will be tested in release.cc */ +} + TEST_F(Write, append_direct_io) { const ssize_t BUFSIZE = 9; @@ -789,7 +827,7 @@ TEST_F(WriteThrough, DISABLED_writethrough) EXPECT_LE(0, fd) << strerror(errno); ASSERT_EQ(bufsize, write(fd, CONTENTS, bufsize)) << strerror(errno); - /* + /* * A subsequent read should be serviced by cache, without querying the * filesystem daemon */ -- 2.45.0