2 * SPDX-License-Identifier: BSD-2-Clause
4 * Copyright (c) 2013 Juniper Networks, Inc.
5 * Copyright (c) 2022-2023 Klara, Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include "opt_tarfs.h"
30 #include "opt_zstdio.h"
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/counter.h>
37 #include <sys/malloc.h>
38 #include <sys/mount.h>
39 #include <sys/sysctl.h>
41 #include <sys/vnode.h>
50 #define ZSTD_STATIC_LINKING_ONLY
51 #include <contrib/zstd/lib/zstd.h>
54 #include <fs/tarfs/tarfs.h>
55 #include <fs/tarfs/tarfs_dbg.h>
/*
 * sysctl(8) node and statistics counters for the tarfs decompression
 * ("zio") layer, exported under vfs.tarfs.zio.
 */
58 SYSCTL_NODE(_vfs_tarfs, OID_AUTO, zio, CTLFLAG_RD, 0,
59 "Tar filesystem decompression layer");
/* Bytes of decompressed output produced by the zstd stream. */
60 COUNTER_U64_DEFINE_EARLY(tarfs_zio_inflated);
61 SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, inflated, CTLFLAG_RD,
62 &tarfs_zio_inflated, "Amount of compressed data inflated.");
/* Bytes of (decompressed) data handed back to readers of the zio node. */
63 COUNTER_U64_DEFINE_EARLY(tarfs_zio_consumed);
64 SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, consumed, CTLFLAG_RD,
65 &tarfs_zio_consumed, "Amount of compressed data consumed.");
/* Bytes copied out through a bounce buffer (non-UIO_SYSSPACE targets). */
66 COUNTER_U64_DEFINE_EARLY(tarfs_zio_bounced);
67 SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, bounced, CTLFLAG_RD,
68 &tarfs_zio_bounced, "Amount of decompressed data bounced.");
/*
 * Handler for the vfs.tarfs.zio.reset sysctl: reading it returns an
 * integer; writing any value zeroes all three zio statistics counters.
 */
71 tarfs_sysctl_handle_zio_reset(SYSCTL_HANDLER_ARGS)
77 if ((error = SYSCTL_OUT(req, &tmp, sizeof(tmp))) != 0)
79 if (req->newptr != NULL) {
80 if ((error = SYSCTL_IN(req, &tmp, sizeof(tmp))) != 0)
/* Any successfully written value resets all counters. */
82 counter_u64_zero(tarfs_zio_inflated);
83 counter_u64_zero(tarfs_zio_consumed);
84 counter_u64_zero(tarfs_zio_bounced);
89 SYSCTL_PROC(_vfs_tarfs_zio, OID_AUTO, reset,
90 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW,
91 NULL, 0, tarfs_sysctl_handle_zio_reset, "IU",
92 "Reset compression counters.");
/* malloc(9) types for decompression state and buffers. */
95 MALLOC_DEFINE(M_TARFSZSTATE, "tarfs zstate", "tarfs decompression state");
96 MALLOC_DEFINE(M_TARFSZBUF, "tarfs zbuf", "tarfs decompression buffers");
/*
 * Magic numbers used to sniff the compression format of the backing
 * file: xz, gzip/zlib, and zstd respectively.
 */
98 #define XZ_MAGIC (uint8_t[]){ 0xfd, 0x37, 0x7a, 0x58, 0x5a }
99 #define ZLIB_MAGIC (uint8_t[]){ 0x1f, 0x8b, 0x08 }
100 #define ZSTD_MAGIC (uint8_t[]){ 0x28, 0xb5, 0x2f, 0xfd }
108 /* XXX review use of curthread / uio_td / td_cred */
111 * Reads from the tar file according to the provided uio. If the archive
112 * is compressed and raw is false, reads the decompressed stream;
113 * otherwise, reads directly from the original file. Returns 0 on success
114 * and a positive errno value on failure.
117 tarfs_io_read(struct tarfs_mount *tmp, bool raw, struct uio *uiop)
120 off_t off = uiop->uio_offset;
121 size_t len = uiop->uio_resid;
124 if (raw || tmp->znode == NULL) {
/* Uncompressed or raw: range-lock the span and read the backing vnode. */
125 rl = vn_rangelock_rlock(tmp->vp, off, off + len);
126 error = vn_lock(tmp->vp, LK_SHARED);
128 error = VOP_READ(tmp->vp, uiop,
129 IO_DIRECT|IO_NODELOCKED,
130 uiop->uio_td->td_ucred);
133 vn_rangelock_unlock(tmp->vp, rl);
/*
 * Compressed: read through the zio node's VOP_READ (tarfs_zread).
 * NOTE(review): the zio node is locked exclusively, presumably because
 * the decompression stream state is mutated by reads — confirm.
 */
135 error = vn_lock(tmp->znode, LK_EXCLUSIVE);
137 error = VOP_READ(tmp->znode, uiop,
138 IO_DIRECT | IO_NODELOCKED,
139 uiop->uio_td->td_ucred);
140 VOP_UNLOCK(tmp->znode);
143 TARFS_DPF(IO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
144 (size_t)off, len, error, uiop->uio_resid);
149 * Reads from the tar file into the provided buffer. If the archive is
150 * compressed and raw is false, reads the decompressed stream; otherwise,
151 * reads directly from the original file. Returns the number of bytes
152 * read on success, 0 on EOF, and a negative errno value on failure.
155 tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
156 void *buf, off_t off, size_t len)
164 TARFS_DPF(IO, "%s(%zu, %zu) null\n", __func__,
/* Build a single-segment kernel-space uio over the caller's buffer. */
170 auio.uio_iov = &aiov;
172 auio.uio_offset = off;
173 auio.uio_segflg = UIO_SYSSPACE;
174 auio.uio_rw = UIO_READ;
175 auio.uio_resid = len;
176 auio.uio_td = curthread;
177 error = tarfs_io_read(tmp, raw, &auio);
179 TARFS_DPF(IO, "%s(%zu, %zu) error %d\n", __func__,
180 (size_t)off, len, error);
/* Bytes actually read = requested length minus what was left unread. */
183 res = len - auio.uio_resid;
184 if (res == 0 && len != 0) {
185 TARFS_DPF(IO, "%s(%zu, %zu) eof\n", __func__,
/* Debug: dump up to the first 8 bytes of the data just read. */
188 TARFS_DPF(IO, "%s(%zu, %zu) read %zd | %*D\n", __func__,
189 (size_t)off, len, res,
190 (int)(res > 8 ? 8 : res), (uint8_t *)buf, " ");
/*
 * Allocation callback handed to the zstd library via ZSTD_customMem;
 * allocates from the M_TARFSZSTATE malloc type (may sleep).
 */
197 tarfs_zstate_alloc(void *opaque, size_t size)
201 return (malloc(size, M_TARFSZSTATE, M_WAITOK));
/*
 * Free callback handed to the zstd library via ZSTD_customMem;
 * releases memory obtained from tarfs_zstate_alloc().
 */
207 tarfs_zstate_free(void *opaque, void *address)
211 free(address, M_TARFSZSTATE);
216 static ZSTD_customMem tarfs_zstd_mem = {
225 * Updates the decompression frame index, recording the current input and
226 * output offsets in a new index entry, and growing the index if
230 tarfs_zio_update_index(struct tarfs_zio *zio, off_t i, off_t o)
233 if (++zio->curidx >= zio->nidx) {
234 if (++zio->nidx > zio->szidx) {
/*
 * Index array is full: grow it.  NOTE(review): szidx appears to be
 * enlarged just before this realloc — confirm against the full source.
 */
236 zio->idx = realloc(zio->idx,
237 zio->szidx * sizeof(*zio->idx),
238 M_TARFSZSTATE, M_ZERO | M_WAITOK);
239 TARFS_DPF(ALLOC, "%s: resized zio index\n", __func__);
/* Record the input/output offset pair for the new entry. */
241 zio->idx[zio->curidx].i = i;
242 zio->idx[zio->curidx].o = o;
243 TARFS_DPF(ZIDX, "%s: index %u = i %zu o %zu\n", __func__,
244 zio->curidx, (size_t)zio->idx[zio->curidx].i,
245 (size_t)zio->idx[zio->curidx].o);
/* Revisiting an existing entry: offsets must match what was recorded. */
247 MPASS(zio->idx[zio->curidx].i == i);
248 MPASS(zio->idx[zio->curidx].o == o);
253 * VOP_ACCESS for zio node.
256 tarfs_zaccess(struct vop_access_args *ap)
258 struct vnode *vp = ap->a_vp;
259 struct tarfs_zio *zio = vp->v_data;
260 struct tarfs_mount *tmp = zio->tmp;
261 accmode_t accmode = ap->a_accmode;
/* Only plain read access is serviced; delegate the check to the backing vnode. */
264 if (accmode == VREAD) {
265 error = vn_lock(tmp->vp, LK_SHARED);
267 error = VOP_ACCESS(tmp->vp, accmode, ap->a_cred, ap->a_td);
271 TARFS_DPF(ZIO, "%s(%d) = %d\n", __func__, accmode, error);
276 * VOP_GETATTR for zio node.
279 tarfs_zgetattr(struct vop_getattr_args *ap)
282 struct vnode *vp = ap->a_vp;
283 struct tarfs_zio *zio = vp->v_data;
284 struct tarfs_mount *tmp = zio->tmp;
285 struct vattr *vap = ap->a_vap;
/* Fetch the attributes of the backing (compressed) file... */
289 error = vn_lock(tmp->vp, LK_SHARED);
291 error = VOP_GETATTR(tmp->vp, &va, ap->a_cred);
/* ...and report them, substituting zio-specific identity and size. */
295 vap->va_mode = va.va_mode;
297 vap->va_gid = va.va_gid;
298 vap->va_uid = va.va_uid;
299 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
300 vap->va_fileid = TARFS_ZIOINO;
/* Size = output offset of the last index entry (decompressed extent known so far). */
301 vap->va_size = zio->idx[zio->nidx - 1].o;
302 vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
303 vap->va_atime = va.va_atime;
304 vap->va_ctime = va.va_ctime;
305 vap->va_mtime = va.va_mtime;
306 vap->va_birthtime = tmp->root->birthtime;
307 vap->va_bytes = va.va_bytes;
310 TARFS_DPF(ZIO, "%s() = %d\n", __func__, error);
316 * VOP_READ for zio node, zstd edition.
319 tarfs_zread_zstd(struct tarfs_zio *zio, struct uio *uiop)
321 void *ibuf = NULL, *obuf = NULL, *rl = NULL;
324 struct tarfs_mount *tmp = zio->tmp;
325 struct tarfs_zstd *zstd = zio->zstd;
326 struct thread *td = curthread;
333 off_t off = uiop->uio_offset;
334 size_t len = uiop->uio_resid;
335 size_t resid = uiop->uio_resid;
340 /* do we have to rewind? */
341 if (off < zio->opos) {
342 while (zio->curidx > 0 && off < zio->idx[zio->curidx].o)
346 /* advance to the nearest index entry */
347 if (off > zio->opos) {
348 // XXX maybe do a binary search instead
349 while (zio->curidx < zio->nidx - 1 &&
350 off >= zio->idx[zio->curidx + 1].o) {
355 /* reset the decompression stream if needed */
357 zio->ipos = zio->idx[zio->curidx].i;
358 zio->opos = zio->idx[zio->curidx].o;
359 ZSTD_resetDStream(zstd->zds);
360 TARFS_DPF(ZIDX, "%s: skipping to index %u = i %zu o %zu\n", __func__,
361 zio->curidx, (size_t)zio->ipos, (size_t)zio->opos);
363 TARFS_DPF(ZIDX, "%s: continuing at i %zu o %zu\n", __func__,
364 (size_t)zio->ipos, (size_t)zio->opos);
368 * Set up a temporary buffer for compressed data. Use the size
369 * recommended by the zstd library; this is usually 128 kB, but
370 * just in case, make sure it's a multiple of the page size and no
371 * larger than MAXBSIZE.
373 bsize = roundup(ZSTD_CStreamOutSize(), PAGE_SIZE);
374 if (bsize > MAXBSIZE)
376 ibuf = malloc(bsize, M_TEMP, M_WAITOK);
382 * Set up the decompression buffer. If the target is not in
383 * kernel space, we will have to set up a bounce buffer.
385 * TODO: to avoid using a bounce buffer, map destination pages
386 * using vm_fault_quick_hold_pages().
388 MPASS(zio->opos <= off);
389 MPASS(uiop->uio_iovcnt == 1);
390 MPASS(uiop->uio_iov->iov_len >= len);
391 if (uiop->uio_segflg == UIO_SYSSPACE) {
392 zob.dst = uiop->uio_iov->iov_base;
394 TARFS_DPF(BOUNCE, "%s: allocating %zu-byte bounce buffer\n",
396 zob.dst = obuf = malloc(len, M_TEMP, M_WAITOK);
/* Range-lock the compressed file from the current input position to EOF. */
402 rl = vn_rangelock_rlock(tmp->vp, zio->ipos, OFF_MAX);
403 error = vn_lock(tmp->vp, LK_SHARED);
408 error = vn_getsize_locked(tmp->vp, &zsize, td->td_ucred);
/* Input position at or past the end of the compressed file: nothing to read. */
412 if (zio->ipos >= zsize) {
/* Main loop: refill the input buffer from the file, then decompress. */
418 if (zib.pos == zib.size) {
419 /* request data from the underlying file */
420 aiov.iov_base = ibuf;
421 aiov.iov_len = bsize;
422 auio.uio_iov = &aiov;
424 auio.uio_offset = zio->ipos;
425 auio.uio_segflg = UIO_SYSSPACE;
426 auio.uio_rw = UIO_READ;
427 auio.uio_resid = aiov.iov_len;
429 error = VOP_READ(tmp->vp, &auio,
430 IO_DIRECT | IO_NODELOCKED,
434 TARFS_DPF(ZIO, "%s: req %zu+%zu got %zu+%zu\n", __func__,
435 (size_t)zio->ipos, bsize,
436 (size_t)zio->ipos, bsize - auio.uio_resid);
438 zib.size = bsize - auio.uio_resid;
441 MPASS(zib.pos <= zib.size);
442 if (zib.pos == zib.size) {
443 TARFS_DPF(ZIO, "%s: end of file after i %zu o %zu\n", __func__,
444 (size_t)zio->ipos, (size_t)zio->opos);
447 if (zio->opos < off) {
448 /* to be discarded */
449 zob.size = min(off - zio->opos, len);
/* Caught up to the requested offset: deliver into the caller's buffer. */
453 zob.pos = zio->opos - off;
457 /* decompress as much as possible */
458 zerror = ZSTD_decompressStream(zstd->zds, &zob, &zib);
459 zio->ipos += ilen = zib.pos - ipos;
460 zio->opos += olen = zob.pos - opos;
463 if (ZSTD_isError(zerror)) {
464 TARFS_DPF(ZIO, "%s: inflate failed after i %zu o %zu: %s\n", __func__,
465 (size_t)zio->ipos, (size_t)zio->opos, ZSTD_getErrorName(zerror));
/* zerror == 0 with no output progress: the stream has ended. */
469 if (zerror == 0 && olen == 0) {
470 TARFS_DPF(ZIO, "%s: end of stream after i %zu o %zu\n", __func__,
471 (size_t)zio->ipos, (size_t)zio->opos);
/* A frame boundary: record it in the seek index. */
475 TARFS_DPF(ZIO, "%s: end of frame after i %zu o %zu\n", __func__,
476 (size_t)zio->ipos, (size_t)zio->opos);
477 tarfs_zio_update_index(zio, zio->ipos, zio->opos);
479 TARFS_DPF(ZIO, "%s: inflated %zu\n", __func__, olen);
481 counter_u64_add(tarfs_zio_inflated, olen);
/* If a bounce buffer was used, copy the decompressed data to the caller. */
488 if (uiop->uio_segflg == UIO_SYSSPACE) {
489 uiop->uio_resid = resid;
490 } else if (len > resid) {
491 TARFS_DPF(BOUNCE, "%s: bounced %zu bytes\n", __func__,
493 error = uiomove(obuf, len - resid, uiop);
495 counter_u64_add(tarfs_zio_bounced, len - resid);
500 TARFS_DPF(BOUNCE, "%s: freeing bounce buffer\n", __func__);
504 vn_rangelock_unlock(tmp->vp, rl);
507 TARFS_DPF(ZIO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
508 (size_t)off, len, error, uiop->uio_resid);
510 counter_u64_add(tarfs_zio_consumed, len - uiop->uio_resid);
/* Error path: rewind stream state to the first index entry and reset zstd. */
514 zio->ipos = zio->idx[0].i;
515 zio->opos = zio->idx[0].o;
516 ZSTD_resetDStream(zstd->zds);
523 * VOP_READ for zio node.
526 tarfs_zread(struct vop_read_args *ap)
528 #if defined(TARFS_DEBUG) || defined(ZSTDIO)
529 struct vnode *vp = ap->a_vp;
530 struct tarfs_zio *zio = vp->v_data;
531 struct uio *uiop = ap->a_uio;
534 off_t off = uiop->uio_offset;
535 size_t len = uiop->uio_resid;
539 TARFS_DPF(ZIO, "%s(%zu, %zu)\n", __func__,
/* Dispatch by compression format; only zstd is implemented here. */
542 if (zio->zstd != NULL) {
543 error = tarfs_zread_zstd(zio, uiop);
547 TARFS_DPF(ZIO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
548 (size_t)off, len, error, uiop->uio_resid);
553 * VOP_RECLAIM for zio node.
556 tarfs_zreclaim(struct vop_reclaim_args *ap)
558 struct vnode *vp = ap->a_vp;
/* Detach the vnode from the zio state; the state itself is freed in tarfs_zio_fini(). */
560 TARFS_DPF(ZIO, "%s(%p)\n", __func__, vp);
566 * VOP_STRATEGY for zio node.
569 tarfs_zstrategy(struct vop_strategy_args *ap)
573 struct vnode *vp = ap->a_vp;
574 struct buf *bp = ap->a_bp;
/* Translate the buf into a kernel-space uio and service it via VOP_READ. */
579 iov.iov_base = bp->b_data;
580 iov.iov_len = bp->b_bcount;
581 off = bp->b_iooffset;
586 auio.uio_offset = off;
587 auio.uio_resid = len;
588 auio.uio_segflg = UIO_SYSSPACE;
589 auio.uio_rw = UIO_READ;
590 auio.uio_td = curthread;
591 error = VOP_READ(vp, &auio, IO_DIRECT | IO_NODELOCKED, bp->b_rcred);
/* Complete the buf, flagging an I/O error if the read failed. */
592 bp->b_flags |= B_DONE;
594 bp->b_ioflags |= BIO_ERROR;
/* Vnode operations vector for the synthetic zio (decompression) node. */
600 static struct vop_vector tarfs_znodeops = {
601 .vop_default = &default_vnodeops,
603 .vop_access = tarfs_zaccess,
604 .vop_getattr = tarfs_zgetattr,
605 .vop_read = tarfs_zread,
606 .vop_reclaim = tarfs_zreclaim,
607 .vop_strategy = tarfs_zstrategy,
609 VFS_VOP_VECTOR_REGISTER(tarfs_znodeops);
613 * Initializes the decompression layer.
615 static struct tarfs_zio *
616 tarfs_zio_init(struct tarfs_mount *tmp, off_t i, off_t o)
618 struct tarfs_zio *zio;
621 zio = malloc(sizeof(*zio), M_TARFSZSTATE, M_ZERO | M_WAITOK);
622 TARFS_DPF(ALLOC, "%s: allocated zio\n", __func__);
/* Seed the frame index with the starting input/output offsets. */
625 zio->idx = malloc(zio->szidx * sizeof(*zio->idx), M_TARFSZSTATE,
629 zio->idx[zio->curidx].i = zio->ipos = i;
630 zio->idx[zio->curidx].o = zio->opos = o;
632 TARFS_DPF(ALLOC, "%s: allocated zio index\n", __func__);
/* Create the vnode that fronts the decompressed stream (tarfs_znodeops). */
633 (void)getnewvnode("tarfsz", tmp->vfs, &tarfs_znodeops, &zvp);
636 zvp->v_mount = tmp->vfs;
637 vn_set_state(zvp, VSTATE_CONSTRUCTED);
639 TARFS_DPF(ZIO, "%s: created zio node\n", __func__);
645 * Initializes the I/O layer, including decompression if the signature of
646 * a supported compression format is detected. Returns 0 on success and a
647 * positive errno value on failure.
650 tarfs_io_init(struct tarfs_mount *tmp)
654 struct tarfs_zio *zio = NULL;
/* Read the first block raw to sniff for a compression magic number. */
659 block = malloc(tmp->iosize, M_TEMP, M_ZERO | M_WAITOK);
660 res = tarfs_io_read_buf(tmp, true, block, 0, tmp->iosize);
664 if (memcmp(block, XZ_MAGIC, sizeof(XZ_MAGIC)) == 0) {
665 printf("xz compression not supported\n");
668 } else if (memcmp(block, ZLIB_MAGIC, sizeof(ZLIB_MAGIC)) == 0) {
669 printf("zlib compression not supported\n");
672 } else if (memcmp(block, ZSTD_MAGIC, sizeof(ZSTD_MAGIC)) == 0) {
/*
 * zstd: set up the zio layer and a zstd decompression stream using
 * the kernel-malloc allocator hooks.  Allocation cannot fail here
 * since tarfs_zstate_alloc() uses M_WAITOK.
 */
674 zio = tarfs_zio_init(tmp, 0, 0);
675 zio->zstd = malloc(sizeof(*zio->zstd), M_TARFSZSTATE, M_WAITOK);
676 zio->zstd->zds = ZSTD_createDStream_advanced(tarfs_zstd_mem);
677 (void)ZSTD_initDStream(zio->zstd->zds);
679 printf("zstd compression not supported\n");
691 * Tears down the decompression layer.
694 tarfs_zio_fini(struct tarfs_mount *tmp)
696 struct tarfs_zio *zio = tmp->zio;
/* Detach and release the zio vnode, if one was created. */
699 if (tmp->znode != NULL) {
700 error = vn_lock(tmp->znode, LK_EXCLUSIVE);
702 TARFS_DPF(ALLOC, "%s: failed to lock znode", __func__);
705 tmp->znode->v_mount = NULL;
/* Free the zstd stream state, then the frame index, then the zio itself. */
711 if (zio->zstd != NULL) {
712 TARFS_DPF(ALLOC, "%s: freeing zstd state\n", __func__);
713 ZSTD_freeDStream(zio->zstd->zds);
714 free(zio->zstd, M_TARFSZSTATE);
717 if (zio->idx != NULL) {
718 TARFS_DPF(ALLOC, "%s: freeing index\n", __func__);
719 free(zio->idx, M_TARFSZSTATE);
721 TARFS_DPF(ALLOC, "%s: freeing zio\n", __func__);
722 free(zio, M_TARFSZSTATE);
729 * Tears down the I/O layer, including the decompression layer if
733 tarfs_io_fini(struct tarfs_mount *tmp)
738 if (tmp->zio != NULL) {
739 error = tarfs_zio_fini(tmp);