1 /* $NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $ */
4 * Copyright (c) 2018 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
32 /* Lzd - Educational decompressor for the lzip format
33 Copyright (C) 2013-2018 Antonio Diaz Diaz.
35 This program is free software. Redistribution and use in source and
36 binary forms, with or without modification, are permitted provided
37 that the following conditions are met:
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
46 This program is distributed in the hope that it will be useful,
47 but WITHOUT ANY WARRANTY; without even the implied warranty of
48 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
51 #include <sys/param.h>
62 #define LITERAL_CONTEXT_BITS 3
63 #define POS_STATE_BITS 2
64 #define POS_STATES (1 << POS_STATE_BITS)
65 #define POS_STATE_MASK (POS_STATES - 1)
68 #define DIS_SLOT_BITS 6
70 #define DIS_MODEL_START 4
71 #define DIS_MODEL_END 14
73 #define MODELED_DISTANCES (1 << (DIS_MODEL_END / 2))
74 #define DIS_ALIGN_BITS 4
75 #define DIS_ALIGN_SIZE (1 << DIS_ALIGN_BITS)
81 #define LOW_SYMBOLS (1 << LOW_BITS)
82 #define MID_SYMBOLS (1 << MID_BITS)
83 #define HIGH_SYMBOLS (1 << HIGH_BITS)
85 #define MAX_SYMBOLS (LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS)
87 #define MIN_MATCH_LEN 2
89 #define BIT_MODEL_MOVE_BITS 5
90 #define BIT_MODEL_TOTAL_BITS 11
91 #define BIT_MODEL_TOTAL (1 << BIT_MODEL_TOTAL_BITS)
92 #define BIT_MODEL_INIT (BIT_MODEL_TOTAL / 2)
94 static const int lz_st_next[] = {
95 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5,
99 lz_st_is_char(int st) {
104 lz_st_get_char(int st) {
105 return lz_st_next[st];
109 lz_st_get_match(int st) {
110 return st < 7 ? 7 : 10;
114 lz_st_get_rep(int st) {
115 return st < 7 ? 8 : 11;
119 lz_st_get_short_rep(int st) {
120 return st < 7 ? 9 : 11;
123 struct lz_len_model {
126 int bm_low[POS_STATES][LOW_SYMBOLS];
127 int bm_mid[POS_STATES][MID_SYMBOLS];
128 int bm_high[HIGH_SYMBOLS];
131 static uint32_t lz_crc[256];
136 for (unsigned i = 0; i < nitems(lz_crc); i++) {
138 for (unsigned j = 0; j < 8; j++) {
140 c = 0xEDB88320U ^ (c >> 1);
149 lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len)
151 for (size_t i = 0; i < len; i++)
152 *crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8);
155 struct lz_range_decoder {
162 lz_rd_create(struct lz_range_decoder *rd, FILE *fp)
167 for (int i = 0; i < 5; i++)
168 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
169 return ferror(rd->fp) ? -1 : 0;
173 lz_rd_decode(struct lz_range_decoder *rd, int num_bits)
177 for (int i = num_bits; i > 0; i--) {
180 if (rd->code >= rd->range) {
181 rd->code -= rd->range;
184 if (rd->range <= 0x00FFFFFFU) {
186 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
194 lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm)
197 const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm;
199 if(rd->code < bound) {
201 *bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS;
207 *bm -= *bm >> BIT_MODEL_MOVE_BITS;
211 if (rd->range <= 0x00FFFFFFU) {
213 rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
219 lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits)
223 for (int i = 0; i < num_bits; i++)
224 symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]);
226 return symbol - (1 << num_bits);
230 lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits)
232 unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits);
233 unsigned reversed_symbol = 0;
235 for (int i = 0; i < num_bits; i++) {
236 reversed_symbol = (reversed_symbol << 1) | (symbol & 1);
240 return reversed_symbol;
244 lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte)
248 for (int i = 7; i >= 0; i--) {
249 const unsigned match_bit = (match_byte >> i) & 1;
250 const unsigned bit = lz_rd_decode_bit(rd,
251 &bm[symbol + (match_bit << 8) + 0x100]);
252 symbol = (symbol << 1) | bit;
253 if (match_bit != bit) {
254 while (symbol < 0x100) {
255 symbol = (symbol << 1) |
256 lz_rd_decode_bit(rd, &bm[symbol]);
261 return symbol & 0xFF;
265 lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm,
268 if (lz_rd_decode_bit(rd, &lm->choice1) == 0)
269 return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS);
271 if (lz_rd_decode_bit(rd, &lm->choice2) == 0) {
273 lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS);
276 return LOW_SYMBOLS + MID_SYMBOLS +
277 lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS);
282 off_t pos, ppos, spos, dict_size;
286 struct lz_range_decoder rdec;
290 lz_flush(struct lz_decoder *lz)
292 off_t offs = lz->pos - lz->spos;
296 size_t size = (size_t)offs;
297 lz_crc_update(&lz->crc, lz->obuf + lz->spos, size);
298 if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size)
301 lz->wrapped = lz->pos >= lz->dict_size;
311 lz_destroy(struct lz_decoder *lz)
321 lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size)
323 memset(lz, 0, sizeof(*lz));
325 lz->fin = fdopen(dup(fin), "r");
329 lz->fout = fdopen(dup(fdout), "w");
330 if (lz->fout == NULL)
333 lz->pos = lz->ppos = lz->spos = 0;
335 lz->dict_size = dict_size;
338 lz->obuf = malloc(dict_size);
339 if (lz->obuf == NULL)
342 if (lz_rd_create(&lz->rdec, lz->fin) == -1)
351 lz_peek(const struct lz_decoder *lz, unsigned ahead)
353 off_t diff = lz->pos - ahead - 1;
356 return lz->obuf[diff];
359 return lz->obuf[lz->dict_size + diff];
365 lz_put(struct lz_decoder *lz, uint8_t b)
367 lz->obuf[lz->pos++] = b;
368 if (lz->dict_size == lz->pos)
373 lz_get_data_position(const struct lz_decoder *lz)
375 return lz->ppos + lz->pos;
379 lz_get_crc(const struct lz_decoder *lz)
381 return lz->crc ^ 0xffffffffU;
385 lz_bm_init(int *a, size_t l)
387 for (size_t i = 0; i < l; i++)
388 a[i] = BIT_MODEL_INIT;
391 #define LZ_BM_INIT(a) lz_bm_init(a, nitems(a))
392 #define LZ_BM_INIT2(a) do { \
393 size_t l = nitems(a[0]); \
394 for (size_t i = 0; i < nitems(a); i++) \
395 lz_bm_init(a[i], l); \
396 } while (/*CONSTCOND*/0)
398 #define LZ_MODEL_INIT(a) do { \
399 a.choice1 = BIT_MODEL_INIT; \
400 a.choice2 = BIT_MODEL_INIT; \
401 LZ_BM_INIT2(a.bm_low); \
402 LZ_BM_INIT2(a.bm_mid); \
403 LZ_BM_INIT(a.bm_high); \
404 } while (/*CONSTCOND*/0)
407 lz_decode_member(struct lz_decoder *lz)
409 int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300];
410 int bm_match[LZ_STATES][POS_STATES];
411 int bm_rep[4][LZ_STATES];
412 int bm_len[LZ_STATES][POS_STATES];
413 int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS];
414 int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1];
415 int bm_align[DIS_ALIGN_SIZE];
417 LZ_BM_INIT2(bm_literal);
418 LZ_BM_INIT2(bm_match);
421 LZ_BM_INIT2(bm_dis_slot);
423 LZ_BM_INIT(bm_align);
425 struct lz_len_model match_len_model;
426 struct lz_len_model rep_len_model;
428 LZ_MODEL_INIT(match_len_model);
429 LZ_MODEL_INIT(rep_len_model);
431 struct lz_range_decoder *rd = &lz->rdec;
432 unsigned rep[4] = { 0 };
437 while (!feof(lz->fin) && !ferror(lz->fin)) {
438 const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK;
440 if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) {
441 const uint8_t prev_byte = lz_peek(lz, 0);
442 const int literal_state =
443 prev_byte >> (8 - LITERAL_CONTEXT_BITS);
444 int *bm = bm_literal[literal_state];
445 if (lz_st_is_char(state))
446 lz_put(lz, lz_rd_decode_tree(rd, bm, 8));
448 int peek = lz_peek(lz, rep[0]);
449 lz_put(lz, lz_rd_decode_matched(rd, bm, peek));
451 state = lz_st_get_char(state);
456 if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) {
458 if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) {
460 if (lz_rd_decode_bit(rd,
461 &bm_len[state][pos_state]) == 0)
463 state = lz_st_get_short_rep(state);
464 lz_put(lz, lz_peek(lz, rep[0]));
470 if (lz_rd_decode_bit(rd, &bm_rep[2][state])
475 if (lz_rd_decode_bit(rd,
476 &bm_rep[3][state]) == 0)
487 state = lz_st_get_rep(state);
488 len = MIN_MATCH_LEN +
489 lz_rd_decode_len(rd, &rep_len_model, pos_state);
491 rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0];
492 len = MIN_MATCH_LEN +
493 lz_rd_decode_len(rd, &match_len_model, pos_state);
494 const int len_state =
495 MIN(len - MIN_MATCH_LEN, STATES - 1);
496 rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state],
498 if (rep[0] >= DIS_MODEL_START) {
499 const unsigned dis_slot = rep[0];
500 const int direct_bits = (dis_slot >> 1) - 1;
501 rep[0] = (2 | (dis_slot & 1)) << direct_bits;
502 if (dis_slot < DIS_MODEL_END)
503 rep[0] += lz_rd_decode_tree_reversed(rd,
504 &bm_dis[rep[0] - dis_slot],
507 rep[0] += lz_rd_decode(rd, direct_bits
508 - DIS_ALIGN_BITS) << DIS_ALIGN_BITS;
509 rep[0] += lz_rd_decode_tree_reversed(rd,
510 bm_align, DIS_ALIGN_BITS);
511 if (rep[0] == 0xFFFFFFFFU) {
513 return len == MIN_MATCH_LEN;
517 state = lz_st_get_match(state);
518 if (rep[0] >= lz->dict_size ||
519 (rep[0] >= lz->pos && !lz->wrapped)) {
524 for (int i = 0; i < len; i++)
525 lz_put(lz, lz_peek(lz, rep[0]));
532 * 0-3 CRC32 of the uncompressed data
533 * 4-11 size of the uncompressed data
534 * 12-19 member size including header and trailer
536 #define TRAILER_SIZE 20
540 lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize)
542 struct lz_decoder lz;
545 if (lz_create(&lz, fin, fdout, dict_size) == -1)
548 if (!lz_decode_member(&lz))
551 uint8_t trailer[TRAILER_SIZE];
553 for(size_t i = 0; i < nitems(trailer); i++)
554 trailer[i] = (uint8_t)getc(lz.fin);
557 for (int i = 3; i >= 0; --i) {
562 int64_t data_size = 0;
563 for (int i = 11; i >= 4; --i) {
565 data_size += trailer[i];
568 if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz))
572 for (int i = 19; i >= 12; --i) {
579 /* Does not work with pipes */
580 rv = ftello(lz.fout);
596 #define MIN_DICTIONARY_SIZE (1 << 12)
597 #define MAX_DICTIONARY_SIZE (1 << 29)
599 static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 };
602 lz_get_dict_size(unsigned char c)
604 unsigned dict_size = 1 << (c & 0x1f);
605 dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7);
606 if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE)
612 unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in)
617 char header[HDR_SIZE];
620 memcpy(header, pre, prelen);
622 ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen);
627 return prelen ? -1 : 0;
629 if ((size_t)nr != sizeof(header) - prelen)
634 if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0)
637 unsigned dict_size = lz_get_dict_size(header[5]);
641 return lz_decode(fin, fout, dict_size, bytes_in);