2 * Copyright (c) 2014 Sebastian Freundt
3 * Author: Sebastian Freundt <devel@fresse.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "archive_platform.h"
29 __FBSDID("$FreeBSD$");
46 #include "archive_entry.h"
47 #include "archive_entry_locale.h"
48 #include "archive_private.h"
49 #include "archive_random_private.h"
50 #include "archive_write_private.h"
51 #include "archive_write_set_format_private.h"
54 unsigned int omit_warcinfo:1;
/* Payload of the file-wide warcinfo record: names the writing software and
 * the WARC version.  Its users take sizeof(warcinfo) - 1 as the length so
 * that the terminating NUL is not counted. */
63 static const char warcinfo[] =
64 "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
65 "format: WARC file version 1.0\r\n";
75 /* request, unsupported */
77 /* response, unsupported */
79 /* revisit, unsupported */
81 /* conversion, unsupported */
83 /* continuation, unsupported at the moment */
97 } warc_essential_hdr_t;
/* Format callbacks; archive_write_set_format_warc() installs each of these
 * on the archive_write handle. */
103 static int _warc_options(struct archive_write*, const char *key, const char *v);
104 static int _warc_header(struct archive_write *a, struct archive_entry *entry);
105 static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz);
106 static int _warc_finish_entry(struct archive_write *a);
107 static int _warc_close(struct archive_write *a);
108 static int _warc_free(struct archive_write *a);
110 /* private routines */
/* _popul_ehdr() formats one record header into t, with z as the size limit
 * used for its result; _gen_uuid() fills tgt with a random version 4 UUID. */
111 static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t);
112 static int _gen_uuid(warc_uuid_t *tgt);
116 * Set output format to ISO 28500 (aka WARC) format.
/* Public entry point: switch the write handle _a to the WARC format.
 * archive_check_magic() is called with ARCHIVE_WRITE_MAGIC and
 * ARCHIVE_STATE_NEW, then the per-archive WARC state w is allocated and
 * initialised (warcinfo emission enabled, rng seeded from w->now) and the
 * _warc_* callbacks plus the format code and name are installed on a.
 * The allocation failure path reports ENOMEM and returns ARCHIVE_FATAL. */
119 archive_write_set_format_warc(struct archive *_a)
121 struct archive_write *a = (struct archive_write *)_a;
124 archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
125 ARCHIVE_STATE_NEW, "archive_write_set_format_warc");
127 /* If another format was already registered, unregister it. */
128 if (a->format_free != NULL) {
/* state for this format, sized from the pointee type */
132 w = malloc(sizeof(*w));
134 archive_set_error(&a->archive, ENOMEM,
135 "Can't allocate warc data");
136 return (ARCHIVE_FATAL);
138 /* by default we're emitting a file wide header */
139 w->omit_warcinfo = 0U;
140 /* obtain current time for date fields */
142 /* reset file type info */
144 /* also initialise our rng */
/* NOTE(review): the seed is w->now truncated to unsigned int, i.e. derived
 * from the clock -- confirm w->rng is not relied on for unpredictability */
145 w->rng = (unsigned int)w->now;
/* publish the format name and wire up the callbacks declared above */
148 a->format_name = "WARC/1.0";
149 a->format_options = _warc_options;
150 a->format_write_header = _warc_header;
151 a->format_write_data = _warc_data;
152 a->format_close = _warc_close;
153 a->format_free = _warc_free;
154 a->format_finish_entry = _warc_finish_entry;
155 a->archive.archive_format = ARCHIVE_FORMAT_WARC;
156 a->archive.archive_format_name = "WARC/1.0";
161 /* archive methods */
/* Option callback.  The only key compared against here is omit-warcinfo:
 * with no value, or with the value true, it sets w->omit_warcinfo so that
 * _warc_header() will not emit the file-wide warcinfo record.
 * The trailing return hands back ARCHIVE_WARN to signal an unhandled
 * option to the caller. */
163 _warc_options(struct archive_write *a, const char *key, const char *val)
165 struct warc_s *w = a->format_data;
167 if (strcmp(key, "omit-warcinfo") == 0) {
/* a missing value is treated the same as an explicit true */
168 if (val == NULL || strcmp(val, "true") == 0) {
170 w->omit_warcinfo = 1U;
175 /* Note: The "warn" return is just to inform the options
176 * supervisor that we didn't handle it. It will generate
177 * a suitable error if no one used this option. */
178 return (ARCHIVE_WARN);
/* Header callback: start a new entry.
 * While w->omit_warcinfo is clear, a warcinfo record is built first (header
 * from _popul_ehdr(), then the warcinfo payload, then the CRLF CRLF
 * end-of-record marker), written out, and omit_warcinfo is set so this
 * happens only once.
 * An entry without a pathname yields ARCHIVE_WARN.  For regular files
 * (AE_IFREG) a record header is written and the announced size is stored in
 * w->populz for _warc_data(); the unsupported-filetype path reports the
 * entry and returns ARCHIVE_FAILED. */
182 _warc_header(struct archive_write *a, struct archive_entry *entry)
184 struct warc_s *w = a->format_data;
185 struct archive_string hdr;
/* size limit handed to _popul_ehdr() for a single record header */
186 #define MAX_HDR_SIZE 512
188 /* check whether warcinfo record needs outputting */
189 if (!w->omit_warcinfo) {
191 warc_essential_hdr_t wi = {
197 /*cty*/"application/warc-fields",
198 /*len*/sizeof(warcinfo) - 1U,
203 archive_string_init(&hdr);
204 r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
207 /* now also use HDR buffer for the actual warcinfo */
208 archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
210 /* append end-of-record indicator */
211 archive_strncat(&hdr, "\r\n\r\n", 4);
213 /* write to output stream */
/* NOTE(review): the result of __archive_write_output() is discarded here,
 * whereas _warc_data() checks it -- confirm a failed write may be ignored */
214 __archive_write_output(a, hdr.s, archive_strlen(&hdr));
216 /* indicate we're done with file header writing */
217 w->omit_warcinfo = 1U;
218 archive_string_free(&hdr);
/* the pathname later becomes the target URI, so it must be present */
221 if (archive_entry_pathname(entry) == NULL) {
222 archive_set_error(&a->archive, EINVAL,
224 return (ARCHIVE_WARN);
/* remembered so that _warc_data() and _warc_finish_entry() can test it */
227 w->typ = archive_entry_filetype(entry);
229 if (w->typ == AE_IFREG) {
230 warc_essential_hdr_t rh = {
240 rh.tgturi = archive_entry_pathname(entry);
242 rh.mtime = archive_entry_mtime(entry);
/* NOTE(review): the cast may narrow the entry size on targets where size_t
 * is narrower than the type archive_entry_size() returns -- confirm */
243 rh.cntlen = (size_t)archive_entry_size(entry);
245 archive_string_init(&hdr);
246 r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
251 ARCHIVE_ERRNO_FILE_FORMAT,
252 "cannot archive file");
253 return (ARCHIVE_WARN);
255 /* otherwise append to output stream */
/* r is the header length returned by _popul_ehdr();
 * NOTE(review): the write result is not checked here either */
256 __archive_write_output(a, hdr.s, r);
257 /* and let subsequent calls to _data() know about the size */
258 w->populz = rh.cntlen;
259 archive_string_free(&hdr);
262 /* just resort to erroring as per Tim's advice */
263 __archive_write_entry_filetype_unsupported(
264 &a->archive, entry, "WARC");
265 return (ARCHIVE_FAILED);
/* Data callback: write entry payload.  For regular files len is clamped to
 * w->populz, the size announced by _warc_header(), before buf is handed to
 * __archive_write_output(); the write status is compared with ARCHIVE_OK. */
269 _warc_data(struct archive_write *a, const void *buf, size_t len)
271 struct warc_s *w = a->format_data;
/* w->typ was recorded by _warc_header() for the current entry */
273 if (w->typ == AE_IFREG) {
276 /* never write more bytes than announced */
277 if (len > w->populz) {
278 len = (size_t)w->populz;
281 /* now then, out we put the whole shebang */
282 rc = __archive_write_output(a, buf, len);
283 if (rc != ARCHIVE_OK) {
/* Finish-entry callback: for regular files append the CRLF CRLF
 * end-of-record marker and check the write status against ARCHIVE_OK. */
291 _warc_finish_entry(struct archive_write *a)
293 static const char _eor[] = "\r\n\r\n";
294 struct warc_s *w = a->format_data;
296 if (w->typ == AE_IFREG) {
/* sizeof minus one drops the terminating NUL, leaving the 4 marker bytes */
297 int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U);
299 if (rc != ARCHIVE_OK) {
303 /* reset type info */
/* Close callback; the handle argument is explicitly marked as unused. */
309 _warc_close(struct archive_write *a)
311 (void)a; /* UNUSED */
/* Free callback: picks up the WARC state from a->format_data and clears
 * that pointer on the handle so it no longer refers to the state. */
316 _warc_free(struct archive_write *a)
318 struct warc_s *w = a->format_data;
321 a->format_data = NULL;
326 /* private routines */
/* Append to the archive_string as the time t rendered with the strftime(3)
 * format fmt.  The broken-down time is obtained through gmtime_r(),
 * _gmtime64_s() or plain gmtime(), selected by the HAVE_* configuration
 * macros tested below. */
328 xstrftime(struct archive_string *as, const char *fmt, time_t t)
330 /** like strftime(3) but for time_t objects */
332 #if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S)
335 #if defined(HAVE__GMTIME64_S)
343 if ((rt = gmtime_r(&t, &timeHere)) == NULL)
345 #elif defined(HAVE__GMTIME64_S)
347 terr = _gmtime64_s(&timeHere, &tmptime);
/* fallback when neither of the two variants above is configured */
353 if ((rt = gmtime(&t)) == NULL)
356 /* leave the hard yacker to our role model strftime() */
357 len = strftime(strtime, sizeof(strtime)-1, fmt, rt);
/* only the len bytes that strftime() reported are appended */
358 archive_strncat(as, strtime, len);
/* Render the essential header fields of one WARC record into tgt.
 * Emits, in order: the WARC/1.0 version line, WARC-Type, WARC-Target-URI
 * (only when hdr.tgturi is set), WARC-Date, Last-Modified, WARC-Record-ID
 * (a urn:uuid value is generated when hdr.recid is NULL), Content-Type (only
 * when hdr.cnttyp is set), Content-Length and a terminating blank line.
 * Returns the resulting length of tgt, or -1 when that length is >= tsz.
 * hdr is passed by value, so filling in hdr.recid stays local to this call. */
362 _popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
364 static const char _ver[] = "WARC/1.0\r\n";
365 static const char * const _typ[LAST_WT] = {
366 NULL, "warcinfo", "metadata", "resource", NULL
/* guards the _typ[] lookup below; presumably WT_NONE and everything past
 * WT_RSRC correspond to the NULL slots -- enum order to be confirmed */
370 if (hdr.type == WT_NONE || hdr.type > WT_RSRC) {
371 /* brilliant, how exactly did we get here? */
/* archive_strcpy() replaces whatever tgt held; all later calls append */
375 archive_strcpy(tgt, _ver);
377 archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]);
379 if (hdr.tgturi != NULL) {
380 /* check if there's a xyz:// */
381 static const char _uri[] = "";
382 static const char _fil[] = "file://";
384 char *chk = strchr(hdr.tgturi, ':');
/* a colon directly followed by two slashes counts as a scheme prefix */
386 if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') {
387 /* yep, it's definitely a URI */
390 /* hm, best to prepend file:// then */
393 archive_string_sprintf(tgt,
394 "WARC-Target-URI: %s%s\r\n", u, hdr.tgturi);
397 /* record time is usually when the http is sent off,
398 * just treat the archive writing as such for a moment */
399 xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
401 /* while we're at it, record the mtime */
402 xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);
404 if (hdr.recid == NULL) {
405 /* generate one, grrrr */
409 /* Unfortunately, archive_string_sprintf does not
410 * handle the minimum number following '%'.
411 * So we have to use snprintf function here instead
412 * of archive_string_sprintf function. */
413 #if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
414 #define snprintf _snprintf
417 std_uuid, sizeof(std_uuid),
418 "<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
420 u.u[1U] >> 16U, u.u[1U] & 0xffffU,
421 u.u[2U] >> 16U, u.u[2U] & 0xffffU,
/* from here on the generated id is used like a caller-supplied one */
423 hdr.recid = std_uuid;
426 /* record-id is mandatory, fingers crossed we won't fail */
427 archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);
429 if (hdr.cnttyp != NULL) {
430 archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
433 /* next one is mandatory */
/* the cast to uintmax_t matches the ju conversion in the format */
434 archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);
/* blank line that separates the header block from the record payload */
436 archive_strncat(tgt, "\r\n", 2);
/* NOTE(review): tsz appears to be consulted only here, after tgt has
 * already grown -- confirm it is meant as a check and not as a bound */
438 return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt);
/* Fill tgt with a random version 4 UUID: tgt->u is filled by
 * archive_random(), then the version and variant bits are forced. */
442 _gen_uuid(warc_uuid_t *tgt)
444 archive_random(tgt->u, sizeof(tgt->u));
445 /* obey uuid version 4 rules */
/* bits 12-15 of u[1] are cleared and then set to 4 (the version nibble) */
446 tgt->u[1U] &= 0xffff0fffU;
447 tgt->u[1U] |= 0x4000U;
/* the top two bits of u[2] are forced to binary 10 (the variant field) */
448 tgt->u[2U] &= 0x3fffffffU;
449 tgt->u[2U] |= 0x80000000U;
453 /* archive_write_set_format_warc.c ends here */