/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/zstd_errors.h"
kstat_t *zstd_ksp = NULL;
typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
} zstd_stats_t;
static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
};
/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};
/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	time_t timeout;
};
/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};
/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};
struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};
/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers were split up to keep the implementation as simple as
 * possible.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};
/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init.
 */
static int pool_count = 16;
#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	(60 * 2)
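/*
 * Note: ZSTD_POOL_TIMEOUT is expressed in seconds (two minutes); it is
 * added to gethrestime_sec() in zstd_mempool_alloc() below to schedule the
 * release of an idle pooled object.
 */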
static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;
/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be
 * allocated. If other pooled objects are detected without being used for
 * 2 minutes, they will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, the object will be skipped.
		 *
		 * We need to take the lock before checking the object to
		 * avoid race conditions caused by running in a threaded
		 * context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (size && !mem && pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				continue;
			}

			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}

			mutex_exit(&pool->barrier);
		}
	}

	if (!size || mem) {
		return (mem);
	}
	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first pass may generate holes in the list if objects get
	 * released. We always make sure that these holes get filled instead
	 * of adding new allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				pool->mem = mem;

				if (pool->mem) {
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
					pool->size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}
	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}
/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}
/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}
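/*
 * Worked example of the mapping above (assuming the ZIO_ZSTD_LEVEL_FAST_*
 * enum values are consecutive, which the index arithmetic relies on):
 * ZIO_ZSTD_LEVEL_FAST_5 yields index (ZIO_ZSTD_LEVEL_FAST_5 -
 * ZIO_ZSTD_LEVEL_FAST_1 + ZIO_ZSTD_LEVEL_19) = 4 + 19 = 23, i.e. the
 * {-5, ZIO_ZSTD_LEVEL_FAST_5} entry of zstd_levels[], so *zstd_level
 * becomes -5. Regular levels map 1:1, e.g. ZIO_ZSTD_LEVEL_3 -> 3.
 */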
/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);
	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}
	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage,
	 * since both are already handled by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
		}
		return (s_len);
	}
	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);
	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15, which we don't expect ever to be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed data will be
	 * incompatible with older versions.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	hdr->version = ZSTD_VERSION_NUMBER;
	hdr->level = level;
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
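/*
 * Illustrative sketch of the resulting header layout; the authoritative
 * field packing is zfs_zstdhdr_t (sys/zstd/zstd.h), not this comment:
 *
 *	+-------------------+----------------------------+---------------...
 *	| c_len (32 bit BE) | version:24 | level:8 (BE)  | magicless zstd
 *	+-------------------+----------------------------+  frame ...
 *
 * c_len counts only the compressed payload, so the total on-disk size is
 * c_len + sizeof (zfs_zstdhdr_t), as returned above.
 */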
/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */
	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator for data corruption! In such
	 * case return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}
	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level != NULL) {
		*level = hdr_copy.level;
	}

	return (0);
}
/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{
	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}
/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
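/*
 * Note on the pointer arithmetic above: every allocation embeds a
 * struct zstd_kmem bookkeeping header at its start, and the zstd library
 * is handed the address just past that header. zstd_free() reverses this
 * by subtracting sizeof (struct zstd_kmem) again to recover the header
 * and dispatch on its kmem_type.
 */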
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails.
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other following threads need to wait here until
		 * decompression is completed. zstd_free will release this
		 * barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}
/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}
/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx = (struct zstd_pool *)
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx = (struct zstd_pool *)
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}
/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current zstd should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}
/* Release object from pool and free memory */
static void __exit
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}
/* Release memory pool objects */
static void __exit
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}
/* Release unused memory from pool */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Calling alloc with a zero size scans the pool and
	 * releases old, unused objects.
	 */
	zstd_mempool_alloc(zstd_mempool_cctx, 0);
	zstd_mempool_alloc(zstd_mempool_dctx, 0);
}
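/*
 * A zero-size request never hands out memory: in zstd_mempool_alloc() the
 * size check fails for every slot, expired objects get vmem_free'd, and
 * NULL is returned, which is what makes it usable as a reaping primitive.
 */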
extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
	}

	return (0);
}
extern void __exit
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}
#if defined(_KERNEL)
module_init(zstd_init);
module_exit(zstd_fini);

ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
ZFS_MODULE_LICENSE("Dual BSD/GPL");
ZFS_MODULE_VERSION(ZSTD_VERSION_STRING);

EXPORT_SYMBOL(zfs_zstd_compress);
EXPORT_SYMBOL(zfs_zstd_decompress_level);
EXPORT_SYMBOL(zfs_zstd_decompress);
EXPORT_SYMBOL(zfs_zstd_cache_reap_now);
#endif