1 /* $NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $ */
4 * Copyright (c) 2018 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
34 /* Lzd - Educational decompressor for the lzip format
35 Copyright (C) 2013-2018 Antonio Diaz Diaz.
37 This program is free software. Redistribution and use in source and
38 binary forms, with or without modification, are permitted provided
39 that the following conditions are met:
41 1. Redistributions of source code must retain the above copyright
42 notice, this list of conditions and the following disclaimer.
44 2. Redistributions in binary form must reproduce the above copyright
45 notice, this list of conditions and the following disclaimer in the
46 documentation and/or other materials provided with the distribution.
48 This program is distributed in the hope that it will be useful,
49 but WITHOUT ANY WARRANTY; without even the implied warranty of
50 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
53 #include <sys/param.h>
64 #define LITERAL_CONTEXT_BITS 3
65 #define POS_STATE_BITS 2
66 #define POS_STATES (1 << POS_STATE_BITS)
67 #define POS_STATE_MASK (POS_STATES - 1)
70 #define DIS_SLOT_BITS 6
72 #define DIS_MODEL_START 4
73 #define DIS_MODEL_END 14
75 #define MODELED_DISTANCES (1 << (DIS_MODEL_END / 2))
76 #define DIS_ALIGN_BITS 4
77 #define DIS_ALIGN_SIZE (1 << DIS_ALIGN_BITS)
83 #define LOW_SYMBOLS (1 << LOW_BITS)
84 #define MID_SYMBOLS (1 << MID_BITS)
85 #define HIGH_SYMBOLS (1 << HIGH_BITS)
87 #define MAX_SYMBOLS (LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS)
89 #define MIN_MATCH_LEN 2
91 #define BIT_MODEL_MOVE_BITS 5
92 #define BIT_MODEL_TOTAL_BITS 11
93 #define BIT_MODEL_TOTAL (1 << BIT_MODEL_TOTAL_BITS)
94 #define BIT_MODEL_INIT (BIT_MODEL_TOTAL / 2)
96 static const int lz_st_next[] = {
97 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5,
101 lz_st_is_char(int st) {
106 lz_st_get_char(int st) {
107 return lz_st_next[st];
111 lz_st_get_match(int st) {
112 return st < 7 ? 7 : 10;
116 lz_st_get_rep(int st) {
117 return st < 7 ? 8 : 11;
121 lz_st_get_short_rep(int st) {
122 return st < 7 ? 9 : 11;
125 struct lz_len_model {
128 int bm_low[POS_STATES][LOW_SYMBOLS];
129 int bm_mid[POS_STATES][MID_SYMBOLS];
130 int bm_high[HIGH_SYMBOLS];
133 static uint32_t lz_crc[256];
138 for (unsigned i = 0; i < nitems(lz_crc); i++) {
140 for (unsigned j = 0; j < 8; j++) {
142 c = 0xEDB88320U ^ (c >> 1);
151 lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len)
153 for (size_t i = 0; i < len; i++)
154 *crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8);
157 struct lz_range_decoder {
164 lz_rd_create(struct lz_range_decoder *rd, FILE *fp)
169 for (int i = 0; i < 5; i++)
170 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
171 return ferror(rd->fp) ? -1 : 0;
175 lz_rd_decode(struct lz_range_decoder *rd, int num_bits)
179 for (int i = num_bits; i > 0; i--) {
182 if (rd->code >= rd->range) {
183 rd->code -= rd->range;
186 if (rd->range <= 0x00FFFFFFU) {
188 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
196 lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm)
199 const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm;
201 if(rd->code < bound) {
203 *bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS;
209 *bm -= *bm >> BIT_MODEL_MOVE_BITS;
213 if (rd->range <= 0x00FFFFFFU) {
215 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
221 lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits)
225 for (int i = 0; i < num_bits; i++)
226 symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]);
228 return symbol - (1 << num_bits);
232 lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits)
234 unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits);
235 unsigned reversed_symbol = 0;
237 for (int i = 0; i < num_bits; i++) {
238 reversed_symbol = (reversed_symbol << 1) | (symbol & 1);
242 return reversed_symbol;
246 lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte)
250 for (int i = 7; i >= 0; i--) {
251 const unsigned match_bit = (match_byte >> i) & 1;
252 const unsigned bit = lz_rd_decode_bit(rd,
253 &bm[symbol + (match_bit << 8) + 0x100]);
254 symbol = (symbol << 1) | bit;
255 if (match_bit != bit) {
256 while (symbol < 0x100) {
257 symbol = (symbol << 1) |
258 lz_rd_decode_bit(rd, &bm[symbol]);
263 return symbol & 0xFF;
267 lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm,
270 if (lz_rd_decode_bit(rd, &lm->choice1) == 0)
271 return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS);
273 if (lz_rd_decode_bit(rd, &lm->choice2) == 0) {
275 lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS);
278 return LOW_SYMBOLS + MID_SYMBOLS +
279 lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS);
284 off_t pos, ppos, spos, dict_size;
288 struct lz_range_decoder rdec;
292 lz_flush(struct lz_decoder *lz)
294 off_t offs = lz->pos - lz->spos;
298 size_t size = (size_t)offs;
299 lz_crc_update(&lz->crc, lz->obuf + lz->spos, size);
300 if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size)
303 lz->wrapped = lz->pos >= lz->dict_size;
313 lz_destroy(struct lz_decoder *lz)
323 lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size)
325 memset(lz, 0, sizeof(*lz));
327 lz->fin = fdopen(dup(fin), "r");
331 lz->fout = fdopen(dup(fdout), "w");
332 if (lz->fout == NULL)
335 lz->pos = lz->ppos = lz->spos = 0;
337 lz->dict_size = dict_size;
340 lz->obuf = malloc(dict_size);
341 if (lz->obuf == NULL)
344 if (lz_rd_create(&lz->rdec, lz->fin) == -1)
353 lz_peek(const struct lz_decoder *lz, unsigned ahead)
355 off_t diff = lz->pos - ahead - 1;
358 return lz->obuf[diff];
361 return lz->obuf[lz->dict_size + diff];
367 lz_put(struct lz_decoder *lz, uint8_t b)
369 lz->obuf[lz->pos++] = b;
370 if (lz->dict_size == lz->pos)
375 lz_get_data_position(const struct lz_decoder *lz)
377 return lz->ppos + lz->pos;
381 lz_get_crc(const struct lz_decoder *lz)
383 return lz->crc ^ 0xffffffffU;
387 lz_bm_init(int *a, size_t l)
389 for (size_t i = 0; i < l; i++)
390 a[i] = BIT_MODEL_INIT;
393 #define LZ_BM_INIT(a) lz_bm_init(a, nitems(a))
394 #define LZ_BM_INIT2(a) do { \
395 size_t l = nitems(a[0]); \
396 for (size_t i = 0; i < nitems(a); i++) \
397 lz_bm_init(a[i], l); \
398 } while (/*CONSTCOND*/0)
400 #define LZ_MODEL_INIT(a) do { \
401 a.choice1 = BIT_MODEL_INIT; \
402 a.choice2 = BIT_MODEL_INIT; \
403 LZ_BM_INIT2(a.bm_low); \
404 LZ_BM_INIT2(a.bm_mid); \
405 LZ_BM_INIT(a.bm_high); \
406 } while (/*CONSTCOND*/0)
409 lz_decode_member(struct lz_decoder *lz)
411 int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300];
412 int bm_match[LZ_STATES][POS_STATES];
413 int bm_rep[4][LZ_STATES];
414 int bm_len[LZ_STATES][POS_STATES];
415 int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS];
416 int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1];
417 int bm_align[DIS_ALIGN_SIZE];
419 LZ_BM_INIT2(bm_literal);
420 LZ_BM_INIT2(bm_match);
423 LZ_BM_INIT2(bm_dis_slot);
425 LZ_BM_INIT(bm_align);
427 struct lz_len_model match_len_model;
428 struct lz_len_model rep_len_model;
430 LZ_MODEL_INIT(match_len_model);
431 LZ_MODEL_INIT(rep_len_model);
433 struct lz_range_decoder *rd = &lz->rdec;
434 unsigned rep[4] = { 0 };
439 while (!feof(lz->fin) && !ferror(lz->fin)) {
440 const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK;
442 if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) {
443 const uint8_t prev_byte = lz_peek(lz, 0);
444 const int literal_state =
445 prev_byte >> (8 - LITERAL_CONTEXT_BITS);
446 int *bm = bm_literal[literal_state];
447 if (lz_st_is_char(state))
448 lz_put(lz, lz_rd_decode_tree(rd, bm, 8));
450 int peek = lz_peek(lz, rep[0]);
451 lz_put(lz, lz_rd_decode_matched(rd, bm, peek));
453 state = lz_st_get_char(state);
458 if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) {
460 if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) {
462 if (lz_rd_decode_bit(rd,
463 &bm_len[state][pos_state]) == 0)
465 state = lz_st_get_short_rep(state);
466 lz_put(lz, lz_peek(lz, rep[0]));
472 if (lz_rd_decode_bit(rd, &bm_rep[2][state])
477 if (lz_rd_decode_bit(rd,
478 &bm_rep[3][state]) == 0)
489 state = lz_st_get_rep(state);
490 len = MIN_MATCH_LEN +
491 lz_rd_decode_len(rd, &rep_len_model, pos_state);
493 rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0];
494 len = MIN_MATCH_LEN +
495 lz_rd_decode_len(rd, &match_len_model, pos_state);
496 const int len_state =
497 MIN(len - MIN_MATCH_LEN, STATES - 1);
498 rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state],
500 if (rep[0] >= DIS_MODEL_START) {
501 const unsigned dis_slot = rep[0];
502 const int direct_bits = (dis_slot >> 1) - 1;
503 rep[0] = (2 | (dis_slot & 1)) << direct_bits;
504 if (dis_slot < DIS_MODEL_END)
505 rep[0] += lz_rd_decode_tree_reversed(rd,
506 &bm_dis[rep[0] - dis_slot],
509 rep[0] += lz_rd_decode(rd, direct_bits
510 - DIS_ALIGN_BITS) << DIS_ALIGN_BITS;
511 rep[0] += lz_rd_decode_tree_reversed(rd,
512 bm_align, DIS_ALIGN_BITS);
513 if (rep[0] == 0xFFFFFFFFU) {
515 return len == MIN_MATCH_LEN;
519 state = lz_st_get_match(state);
520 if (rep[0] >= lz->dict_size ||
521 (rep[0] >= lz->pos && !lz->wrapped)) {
526 for (int i = 0; i < len; i++)
527 lz_put(lz, lz_peek(lz, rep[0]));
534 * 0-3 CRC32 of the uncompressed data
535 * 4-11 size of the uncompressed data
536 * 12-19 member size including header and trailer
538 #define TRAILER_SIZE 20
542 lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize)
544 struct lz_decoder lz;
547 if (lz_create(&lz, fin, fdout, dict_size) == -1)
550 if (!lz_decode_member(&lz))
553 uint8_t trailer[TRAILER_SIZE];
555 for(size_t i = 0; i < nitems(trailer); i++)
556 trailer[i] = (uint8_t)getc(lz.fin);
559 for (int i = 3; i >= 0; --i) {
564 int64_t data_size = 0;
565 for (int i = 11; i >= 4; --i) {
567 data_size += trailer[i];
570 if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz))
574 for (int i = 19; i >= 12; --i) {
581 /* Does not work with pipes */
582 rv = ftello(lz.fout);
598 #define MIN_DICTIONARY_SIZE (1 << 12)
599 #define MAX_DICTIONARY_SIZE (1 << 29)
601 static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 };
604 lz_get_dict_size(unsigned char c)
606 unsigned dict_size = 1 << (c & 0x1f);
607 dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7);
608 if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE)
614 unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in)
619 char header[HDR_SIZE];
621 if (prelen > sizeof(header))
624 memcpy(header, pre, prelen);
626 ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen);
631 return prelen ? -1 : 0;
633 if ((size_t)nr != sizeof(header) - prelen)
638 if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0)
641 unsigned dict_size = lz_get_dict_size(header[5]);
645 return lz_decode(fin, fout, dict_size, bytes_in);