2 * Copyright (c) 2003-2007 Tim Kientzle
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 __FBSDID("$FreeBSD$");
31 * Pax interchange is supposed to encode filenames into
32 * UTF-8. Of course, that's not always possible. This
33 * test is intended to verify that filenames always get
34 * stored and restored correctly, regardless of the encodings.
38 * Read a manually-created archive that has filenames that are
39 * stored in binary instead of UTF-8 and verify that we get
40 * the right filename returned and that we get a warning only
41 * if the header isn't marked as binary.
44 test_pax_filename_encoding_1(void)
46 static const char testname[] = "test_pax_filename_encoding.tar";
48 * \314\214 is a valid 2-byte UTF-8 sequence.
49 * \374 is invalid in UTF-8.
51 char filename[] = "abc\314\214mno\374xyz";
53 struct archive_entry *entry;
56 * Read an archive that has non-UTF8 pax filenames in it.
58 extract_reference_file(testname);
59 a = archive_read_new();
60 assertEqualInt(ARCHIVE_OK, archive_read_support_format_tar(a));
61 assertEqualInt(ARCHIVE_OK, archive_read_support_filter_all(a));
62 assertEqualInt(ARCHIVE_OK,
63 archive_read_open_filename(a, testname, 10240));
65 * First entry in this test archive has an invalid UTF-8 sequence
66 * in it, but the header is not marked as hdrcharset=BINARY, so that
69 failure("Invalid UTF8 in a pax archive pathname should cause a warning");
70 assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
71 assertEqualString(filename, archive_entry_pathname(entry));
73 * Second entry is identical except that it does have
74 * hdrcharset=BINARY, so no warning should be generated.
76 failure("A pathname with hdrcharset=BINARY can have invalid UTF8\n"
77 " characters in it without generating a warning");
78 assertEqualInt(ARCHIVE_OK, archive_read_next_header(a, &entry));
79 assertEqualString(filename, archive_entry_pathname(entry));
84 * Set the locale and write a pathname containing invalid characters.
85 * This should work; the underlying implementation should automatically
86 * fall back to storing the pathname in binary.
89 test_pax_filename_encoding_2(void)
91 char filename[] = "abc\314\214mno\374xyz";
93 struct archive_entry *entry;
95 char longname[] = "abc\314\214mno\374xyz"
96 "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
97 "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
98 "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
99 "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
100 "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
101 "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
106 * We need a starting locale which has invalid sequences.
107 * en_US.UTF-8 seems to be commonly supported.
109 /* If it doesn't exist, just warn and return. */
110 if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
111 skipping("invalid encoding tests require a suitable locale;"
112 " en_US.UTF-8 not available on this system");
116 assert((a = archive_write_new()) != NULL);
117 assertEqualIntA(a, 0, archive_write_set_format_pax(a));
118 assertEqualIntA(a, 0, archive_write_add_filter_none(a));
119 assertEqualIntA(a, 0, archive_write_set_bytes_per_block(a, 0));
121 archive_write_open_memory(a, buff, sizeof(buff), &used));
123 assert((entry = archive_entry_new()) != NULL);
124 /* Set pathname, gname, uname, hardlink to nonconvertible values. */
125 archive_entry_copy_pathname(entry, filename);
126 archive_entry_copy_gname(entry, filename);
127 archive_entry_copy_uname(entry, filename);
128 archive_entry_copy_hardlink(entry, filename);
129 archive_entry_set_filetype(entry, AE_IFREG);
130 failure("This should generate a warning for nonconvertible names.");
131 assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
132 archive_entry_free(entry);
134 assert((entry = archive_entry_new()) != NULL);
135 /* Set path, gname, uname, and symlink to nonconvertible values. */
136 archive_entry_copy_pathname(entry, filename);
137 archive_entry_copy_gname(entry, filename);
138 archive_entry_copy_uname(entry, filename);
139 archive_entry_copy_symlink(entry, filename);
140 archive_entry_set_filetype(entry, AE_IFLNK);
141 failure("This should generate a warning for nonconvertible names.");
142 assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
143 archive_entry_free(entry);
145 assert((entry = archive_entry_new()) != NULL);
146 /* Set pathname to a very long nonconvertible value. */
147 archive_entry_copy_pathname(entry, longname);
148 archive_entry_set_filetype(entry, AE_IFREG);
149 failure("This should generate a warning for nonconvertible names.");
150 assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
151 archive_entry_free(entry);
153 assertEqualIntA(a, ARCHIVE_OK, archive_write_close(a));
154 assertEqualInt(ARCHIVE_OK, archive_write_free(a));
157 * Now read the entries back.
160 assert((a = archive_read_new()) != NULL);
161 assertEqualInt(0, archive_read_support_format_tar(a));
162 assertEqualInt(0, archive_read_open_memory(a, buff, used));
164 assertEqualInt(0, archive_read_next_header(a, &entry));
165 assertEqualString(filename, archive_entry_pathname(entry));
166 assertEqualString(filename, archive_entry_gname(entry));
167 assertEqualString(filename, archive_entry_uname(entry));
168 assertEqualString(filename, archive_entry_hardlink(entry));
170 assertEqualInt(0, archive_read_next_header(a, &entry));
171 assertEqualString(filename, archive_entry_pathname(entry));
172 assertEqualString(filename, archive_entry_gname(entry));
173 assertEqualString(filename, archive_entry_uname(entry));
174 assertEqualString(filename, archive_entry_symlink(entry));
176 assertEqualInt(0, archive_read_next_header(a, &entry));
177 assertEqualString(longname, archive_entry_pathname(entry));
179 assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
180 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
#if 0 /* Disabled until Tim has checked this out. */

/*
 * Create entries starting from a wide-character Unicode name, then
 * read them back in the "C" locale, which cannot represent the name.
 * TODO: Figure out the "right" behavior here.
 */
static void
test_pax_filename_encoding_3(void)
{
	/*
	 * 'A' and 'B' are placeholders; they are overwritten below with
	 * U+1234 and U+5678 so that badname is the wide-character form of
	 * badname_utf8 (\xE1\x88\xB4 is U+1234, \xE5\x99\xB8 is U+5678).
	 */
	wchar_t badname[] = L"xxxAyyyBzzz";
	const char badname_utf8[] = "xxx\xE1\x88\xB4yyy\xE5\x99\xB8zzz";
	struct archive *a;
	struct archive_entry *entry;
	char buff[65536];
	size_t used;

	badname[3] = 0x1234;
	badname[7] = 0x5678;

	/* If the "C" locale cannot be selected, just warn and return. */
	if (NULL == setlocale(LC_ALL, "C")) {
		skipping("Can't set \"C\" locale, so can't exercise "
		    "certain character-conversion failures");
		return;
	}

	/*
	 * If this platform's "C" locale can convert U+1234, there is no
	 * unconvertible character to test with; warn and return.
	 */
	if (wctomb(buff, 0x1234) > 0) {
		skipping("Cannot test conversion failures because \"C\" "
		    "locale on this system has no invalid characters.");
		return;
	}

	/* Skip test if archive_entry_update_pathname_utf8() is broken. */
	/* In particular, this is currently broken on Win32 because
	 * setlocale() does not set the default encoding for CP_ACP. */
	entry = archive_entry_new();
	if (archive_entry_update_pathname_utf8(entry, badname_utf8)) {
		archive_entry_free(entry);
		skipping("Cannot test conversion failures.");
		return;
	}
	archive_entry_free(entry);

	assert((a = archive_write_new()) != NULL);
	assertEqualIntA(a, 0, archive_write_set_format_pax(a));
	assertEqualIntA(a, 0, archive_write_add_filter_none(a));
	assertEqualIntA(a, 0, archive_write_set_bytes_per_block(a, 0));
	assertEqualInt(0,
	    archive_write_open_memory(a, buff, sizeof(buff), &used));

	assert((entry = archive_entry_new()) != NULL);
	/* Set pathname to non-convertible wide value. */
	archive_entry_copy_pathname_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFREG);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assert((entry = archive_entry_new()) != NULL);
	archive_entry_copy_pathname_w(entry, L"abc");
	/* Set gname to non-convertible wide value. */
	archive_entry_copy_gname_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFREG);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assert((entry = archive_entry_new()) != NULL);
	archive_entry_copy_pathname_w(entry, L"abc");
	/* Set uname to non-convertible wide value. */
	archive_entry_copy_uname_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFREG);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assert((entry = archive_entry_new()) != NULL);
	archive_entry_copy_pathname_w(entry, L"abc");
	/* Set hardlink to non-convertible wide value. */
	archive_entry_copy_hardlink_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFREG);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assert((entry = archive_entry_new()) != NULL);
	archive_entry_copy_pathname_w(entry, L"abc");
	/* Set symlink to non-convertible wide value. */
	archive_entry_copy_symlink_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFLNK);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assertEqualIntA(a, ARCHIVE_OK, archive_write_close(a));
	assertEqualInt(ARCHIVE_OK, archive_write_free(a));

	/*
	 * Now read the entries back.
	 */

	assert((a = archive_read_new()) != NULL);
	assertEqualInt(0, archive_read_support_format_tar(a));
	assertEqualInt(0, archive_read_open_memory(a, buff, used));

	failure("A non-convertible pathname should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_pathname_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_pathname(entry));

	failure("A non-convertible gname should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_gname_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_gname(entry));

	failure("A non-convertible uname should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_uname_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_uname(entry));

	failure("A non-convertible hardlink should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_hardlink_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_hardlink(entry));

	failure("A non-convertible symlink should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_symlink_w(entry));
	/* A symlink entry must not report a hardlink target. */
	assertEqualWString(NULL, archive_entry_hardlink_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_symlink(entry));

	assertEqualInt(ARCHIVE_EOF, archive_read_next_header(a, &entry));

	assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
}
#else
/* Empty stand-in so the test driver can still call this sub-test. */
static void
test_pax_filename_encoding_3(void)
{
}
#endif
336 * Verify that KOI8-R filenames are correctly translated to Unicode and UTF-8.
339 test_pax_filename_encoding_KOI8R(void)
342 struct archive_entry *entry;
346 if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
347 skipping("KOI8-R locale not available on this system.");
351 /* Check if the paltform completely supports the string conversion. */
352 a = archive_write_new();
353 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
354 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
355 skipping("This system cannot convert character-set"
356 " from KOI8-R to UTF-8.");
357 archive_write_free(a);
360 archive_write_free(a);
362 /* Re-create a write archive object since filenames should be written
363 * in UTF-8 by default. */
364 a = archive_write_new();
365 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
366 assertEqualInt(ARCHIVE_OK,
367 archive_write_open_memory(a, buff, sizeof(buff), &used));
369 entry = archive_entry_new2(a);
370 archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
371 archive_entry_set_filetype(entry, AE_IFREG);
372 archive_entry_set_size(entry, 0);
373 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
374 archive_entry_free(entry);
375 assertEqualInt(ARCHIVE_OK, archive_write_free(a));
377 /* Above three characters in KOI8-R should translate to the following
378 * three characters (two bytes each) in UTF-8. */
379 assertEqualMem(buff + 512, "15 path=\xD0\xBF\xD1\x80\xD0\xB8\x0A", 15);
383 * Verify that CP1251 filenames are correctly translated to Unicode and UTF-8.
386 test_pax_filename_encoding_CP1251(void)
389 struct archive_entry *entry;
393 if (NULL == setlocale(LC_ALL, "Russian_Russia") &&
394 NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
395 skipping("KOI8-R locale not available on this system.");
399 /* Check if the paltform completely supports the string conversion. */
400 a = archive_write_new();
401 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
402 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
403 skipping("This system cannot convert character-set"
404 " from KOI8-R to UTF-8.");
405 archive_write_free(a);
408 archive_write_free(a);
410 /* Re-create a write archive object since filenames should be written
411 * in UTF-8 by default. */
412 a = archive_write_new();
413 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
414 assertEqualInt(ARCHIVE_OK,
415 archive_write_open_memory(a, buff, sizeof(buff), &used));
417 entry = archive_entry_new2(a);
418 archive_entry_set_pathname(entry, "\xef\xf0\xe8");
419 archive_entry_set_filetype(entry, AE_IFREG);
420 archive_entry_set_size(entry, 0);
421 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
422 archive_entry_free(entry);
423 assertEqualInt(ARCHIVE_OK, archive_write_free(a));
425 /* Above three characters in KOI8-R should translate to the following
426 * three characters (two bytes each) in UTF-8. */
427 assertEqualMem(buff + 512, "15 path=\xD0\xBF\xD1\x80\xD0\xB8\x0A", 15);
431 * Verify that EUC-JP filenames are correctly translated to Unicode and UTF-8.
434 test_pax_filename_encoding_EUCJP(void)
437 struct archive_entry *entry;
441 if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
442 skipping("eucJP locale not available on this system.");
446 /* Check if the paltform completely supports the string conversion. */
447 a = archive_write_new();
448 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
449 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
450 skipping("This system cannot convert character-set"
451 " from eucJP to UTF-8.");
452 archive_write_free(a);
455 archive_write_free(a);
457 /* Re-create a write archive object since filenames should be written
458 * in UTF-8 by default. */
459 a = archive_write_new();
460 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
461 assertEqualInt(ARCHIVE_OK,
462 archive_write_open_memory(a, buff, sizeof(buff), &used));
464 entry = archive_entry_new2(a);
465 archive_entry_set_pathname(entry, "\xC9\xBD.txt");
466 /* Check the Unicode version. */
467 archive_entry_set_filetype(entry, AE_IFREG);
468 archive_entry_set_size(entry, 0);
469 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
470 archive_entry_free(entry);
471 assertEqualInt(ARCHIVE_OK, archive_write_free(a));
473 /* Check UTF-8 version. */
474 assertEqualMem(buff + 512, "16 path=\xE8\xA1\xA8.txt\x0A", 16);
479 * Verify that CP932/SJIS filenames are correctly translated to Unicode and UTF-8.
482 test_pax_filename_encoding_CP932(void)
485 struct archive_entry *entry;
489 if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
490 NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
491 skipping("eucJP locale not available on this system.");
495 /* Check if the paltform completely supports the string conversion. */
496 a = archive_write_new();
497 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
498 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
499 skipping("This system cannot convert character-set"
500 " from CP932/SJIS to UTF-8.");
501 archive_write_free(a);
504 archive_write_free(a);
506 /* Re-create a write archive object since filenames should be written
507 * in UTF-8 by default. */
508 a = archive_write_new();
509 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
510 assertEqualInt(ARCHIVE_OK,
511 archive_write_open_memory(a, buff, sizeof(buff), &used));
513 entry = archive_entry_new2(a);
514 archive_entry_set_pathname(entry, "\x95\x5C.txt");
515 /* Check the Unicode version. */
516 archive_entry_set_filetype(entry, AE_IFREG);
517 archive_entry_set_size(entry, 0);
518 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
519 archive_entry_free(entry);
520 assertEqualInt(ARCHIVE_OK, archive_write_free(a));
522 /* Check UTF-8 version. */
523 assertEqualMem(buff + 512, "16 path=\xE8\xA1\xA8.txt\x0A", 16);
528 * Verify that KOI8-R filenames are not translated to Unicode and UTF-8
529 * when using hdrcharset=BINARY option.
532 test_pax_filename_encoding_KOI8R_BINARY(void)
535 struct archive_entry *entry;
539 if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
540 skipping("KOI8-R locale not available on this system.");
544 a = archive_write_new();
545 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
546 /* BINARY mode should be accepted. */
547 assertEqualInt(ARCHIVE_OK,
548 archive_write_set_options(a, "hdrcharset=BINARY"));
549 assertEqualInt(ARCHIVE_OK,
550 archive_write_open_memory(a, buff, sizeof(buff), &used));
552 entry = archive_entry_new2(a);
553 archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
554 archive_entry_set_filetype(entry, AE_IFREG);
555 archive_entry_set_size(entry, 0);
556 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
557 archive_entry_free(entry);
558 assertEqualInt(ARCHIVE_OK, archive_write_free(a));
560 /* "hdrcharset=BINARY" pax attribute should be written. */
561 assertEqualMem(buff + 512, "21 hdrcharset=BINARY\x0A", 21);
562 /* Above three characters in KOI8-R should not translate to any
564 assertEqualMem(buff + 512+21, "12 path=\xD0\xD2\xC9\x0A", 12);
568 * Pax format writer only accepts both BINARY and UTF-8.
569 * If other character-set name is specified, you will get ARCHIVE_FAILED.
572 test_pax_filename_encoding_KOI8R_CP1251(void)
576 if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
577 skipping("KOI8-R locale not available on this system.");
581 a = archive_write_new();
582 assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
583 /* pax format writer only accepts both BINARY and UTF-8. */
584 assertEqualInt(ARCHIVE_FAILED,
585 archive_write_set_options(a, "hdrcharset=CP1251"));
586 assertEqualInt(ARCHIVE_OK, archive_write_free(a));
590 DEFINE_TEST(test_pax_filename_encoding)
592 test_pax_filename_encoding_1();
593 test_pax_filename_encoding_2();
594 test_pax_filename_encoding_3();
595 test_pax_filename_encoding_KOI8R();
596 test_pax_filename_encoding_CP1251();
597 test_pax_filename_encoding_EUCJP();
598 test_pax_filename_encoding_CP932();
599 test_pax_filename_encoding_KOI8R_BINARY();
600 test_pax_filename_encoding_KOI8R_CP1251();