CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - contrib/libarchive/libarchive/test/test_zip_filename_encoding.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / contrib / libarchive / libarchive / test / test_zip_filename_encoding.c
1 /*-
2  * Copyright (c) 2011 Michihiro NAKAJIMA
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 #include "test.h"
26 __FBSDID("$FreeBSD$");
27
28 #include <locale.h>
29
30 static void
31 test_zip_filename_encoding_UTF8(void)
32 {
33         struct archive *a;
34         struct archive_entry *entry;
35         char buff[4096];
36         size_t used;
37
38         if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
39                 skipping("en_US.UTF-8 locale not available on this system.");
40                 return;
41         }
42
43         /*
44          * Verify that UTF-8 filenames are correctly stored with
45          * hdrcharset=UTF-8 option.
46          */
47         a = archive_write_new();
48         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
49         if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
50                 skipping("This system cannot convert character-set"
51                     " for UTF-8.");
52                 archive_write_free(a);
53                 return;
54         }
55         assertEqualInt(ARCHIVE_OK,
56             archive_write_open_memory(a, buff, sizeof(buff), &used));
57
58         entry = archive_entry_new2(a);
59         /* Set a UTF-8 filename. */
60         archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
61         archive_entry_set_filetype(entry, AE_IFREG);
62         archive_entry_set_size(entry, 0);
63         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
64         archive_entry_free(entry);
65         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
66
67         /* A bit 11 of general purpose flag should be 0x08,
68          * which indicates the filename charset is UTF-8. */
69         assertEqualInt(0x08, buff[7]);
70         assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
71
72         /*
73          * Verify that UTF-8 filenames are correctly stored without
74          * hdrcharset=UTF-8 option.
75          */
76         a = archive_write_new();
77         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
78         assertEqualInt(ARCHIVE_OK,
79             archive_write_open_memory(a, buff, sizeof(buff), &used));
80
81         entry = archive_entry_new2(a);
82         /* Set a UTF-8 filename. */
83         archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
84         archive_entry_set_filetype(entry, AE_IFREG);
85         archive_entry_set_size(entry, 0);
86         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
87         archive_entry_free(entry);
88         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
89
90         /* A bit 11 of general purpose flag should be 0x08,
91          * which indicates the filename charset is UTF-8. */
92         assertEqualInt(0x08, buff[7]);
93         assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
94
95         /*
96          * Verify that A bit 11 of general purpose flag is not set
97          * when ASCII filenames are stored.
98          */
99         a = archive_write_new();
100         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
101         assertEqualInt(ARCHIVE_OK,
102             archive_write_open_memory(a, buff, sizeof(buff), &used));
103
104         entry = archive_entry_new2(a);
105         /* Set an ASCII filename. */
106         archive_entry_set_pathname(entry, "abcABC");
107         archive_entry_set_filetype(entry, AE_IFREG);
108         archive_entry_set_size(entry, 0);
109         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
110         archive_entry_free(entry);
111         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
112
113         /* A bit 11 of general purpose flag should be 0,
114          * which indicates the filename charset is unknown. */
115         assertEqualInt(0, buff[7]);
116         assertEqualMem(buff + 30, "abcABC", 6);
117 }
118
119 static void
120 test_zip_filename_encoding_KOI8R(void)
121 {
122         struct archive *a;
123         struct archive_entry *entry;
124         char buff[4096];
125         size_t used;
126
127         if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
128                 skipping("KOI8-R locale not available on this system.");
129                 return;
130         }
131
132         /*
133          * Verify that KOI8-R filenames are correctly translated to UTF-8.
134          */
135         a = archive_write_new();
136         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
137         if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
138                 skipping("This system cannot convert character-set"
139                     " from KOI8-R to UTF-8.");
140                 archive_write_free(a);
141                 return;
142         }
143         assertEqualInt(ARCHIVE_OK,
144             archive_write_open_memory(a, buff, sizeof(buff), &used));
145
146         entry = archive_entry_new2(a);
147         /* Set a KOI8-R filename. */
148         archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
149         archive_entry_set_filetype(entry, AE_IFREG);
150         archive_entry_set_size(entry, 0);
151         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
152         archive_entry_free(entry);
153         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
154
155         /* A bit 11 of general purpose flag should be 0x08,
156          * which indicates the filename charset is UTF-8. */
157         assertEqualInt(0x08, buff[7]);
158         /* Above three characters in KOI8-R should translate to the following
159          * three characters (two bytes each) in UTF-8. */
160         assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
161
162         /*
163          * Verify that KOI8-R filenames are not translated to UTF-8.
164          */
165         a = archive_write_new();
166         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
167         assertEqualInt(ARCHIVE_OK,
168             archive_write_open_memory(a, buff, sizeof(buff), &used));
169
170         entry = archive_entry_new2(a);
171         /* Set a KOI8-R filename. */
172         archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
173         archive_entry_set_filetype(entry, AE_IFREG);
174         archive_entry_set_size(entry, 0);
175         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
176         archive_entry_free(entry);
177         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
178
179         /* A bit 11 of general purpose flag should be 0,
180          * which indicates the filename charset is unknown. */
181         assertEqualInt(0, buff[7]);
182         /* Above three characters in KOI8-R should not translate to
183          * any character-set. */
184         assertEqualMem(buff + 30, "\xD0\xD2\xC9", 3);
185
186         /*
187          * Verify that A bit 11 of general purpose flag is not set
188          * when ASCII filenames are stored even if hdrcharset=UTF-8
189          * is specified.
190          */
191         a = archive_write_new();
192         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
193         if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
194                 skipping("This system cannot convert character-set"
195                     " from KOI8-R to UTF-8.");
196                 archive_write_free(a);
197                 return;
198         }
199         assertEqualInt(ARCHIVE_OK,
200             archive_write_open_memory(a, buff, sizeof(buff), &used));
201
202         entry = archive_entry_new2(a);
203         /* Set an ASCII filename. */
204         archive_entry_set_pathname(entry, "abcABC");
205         archive_entry_set_filetype(entry, AE_IFREG);
206         archive_entry_set_size(entry, 0);
207         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
208         archive_entry_free(entry);
209         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
210
211         /* A bit 11 of general purpose flag should be 0,
212          * which indicates the filename charset is unknown. */
213         assertEqualInt(0, buff[7]);
214         assertEqualMem(buff + 30, "abcABC", 6);
215 }
216
217 /*
218  * Do not translate CP1251 into CP866 if non Windows platform.
219  */
220 static void
221 test_zip_filename_encoding_ru_RU_CP1251(void)
222 {
223         struct archive *a;
224         struct archive_entry *entry;
225         char buff[4096];
226         size_t used;
227
228         if (NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
229                 skipping("Russian_Russia locale not available on this system.");
230                 return;
231         }
232
233         /*
234          * Verify that CP1251 filenames are not translated into any
235          * other character-set, in particular, CP866.
236          */
237         a = archive_write_new();
238         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
239         assertEqualInt(ARCHIVE_OK,
240             archive_write_open_memory(a, buff, sizeof(buff), &used));
241
242         entry = archive_entry_new2(a);
243         /* Set a CP1251 filename. */
244         archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
245         archive_entry_set_filetype(entry, AE_IFREG);
246         archive_entry_set_size(entry, 0);
247         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
248         archive_entry_free(entry);
249         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
250
251         /* A bit 11 of general purpose flag should be 0,
252          * which indicates the filename charset is unknown. */
253         assertEqualInt(0, buff[7]);
254         /* Above three characters in CP1251 should not translate into
255          * any other character-set. */
256         assertEqualMem(buff + 30, "\xEF\xF0\xE8", 3);
257 }
258
259 /*
260  * Other archiver applications on Windows translate CP1251 filenames
261  * into CP866 filenames and store it in the zip file.
262  * Test above behavior works well.
263  */
264 static void
265 test_zip_filename_encoding_Russian_Russia(void)
266 {
267         struct archive *a;
268         struct archive_entry *entry;
269         char buff[4096];
270         size_t used;
271
272         if (NULL == setlocale(LC_ALL, "Russian_Russia")) {
273                 skipping("Russian_Russia locale not available on this system.");
274                 return;
275         }
276
277         /*
278          * Verify that Russian_Russia(CP1251) filenames are correctly translated
279          * to UTF-8.
280          */
281         a = archive_write_new();
282         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
283         if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
284                 skipping("This system cannot convert character-set"
285                     " from Russian_Russia.CP1251 to UTF-8.");
286                 archive_write_free(a);
287                 return;
288         }
289         assertEqualInt(ARCHIVE_OK,
290             archive_write_open_memory(a, buff, sizeof(buff), &used));
291
292         entry = archive_entry_new2(a);
293         /* Set a CP1251 filename. */
294         archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
295         archive_entry_set_filetype(entry, AE_IFREG);
296         archive_entry_set_size(entry, 0);
297         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
298         archive_entry_free(entry);
299         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
300
301         /* A bit 11 of general purpose flag should be 0x08,
302          * which indicates the filename charset is UTF-8. */
303         assertEqualInt(0x08, buff[7]);
304         /* Above three characters in CP1251 should translate to the following
305          * three characters (two bytes each) in UTF-8. */
306         assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
307
308         /*
309          * Verify that Russian_Russia(CP1251) filenames are correctly translated
310          * to CP866.
311          */
312         a = archive_write_new();
313         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
314         assertEqualInt(ARCHIVE_OK,
315             archive_write_open_memory(a, buff, sizeof(buff), &used));
316
317         entry = archive_entry_new2(a);
318         /* Set a CP1251 filename. */
319         archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
320         archive_entry_set_filetype(entry, AE_IFREG);
321         archive_entry_set_size(entry, 0);
322         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
323         archive_entry_free(entry);
324         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
325
326         /* A bit 11 of general purpose flag should be 0,
327          * which indicates the filename charset is unknown. */
328         assertEqualInt(0, buff[7]);
329         /* Above three characters in CP1251 should translate to the following
330          * three characters in CP866. */
331         assertEqualMem(buff + 30, "\xAF\xE0\xA8", 3);
332 }
333
334 static void
335 test_zip_filename_encoding_EUCJP(void)
336 {
337         struct archive *a;
338         struct archive_entry *entry;
339         char buff[4096];
340         size_t used;
341
342         if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
343                 skipping("eucJP locale not available on this system.");
344                 return;
345         }
346
347         /*
348          * Verify that EUC-JP filenames are correctly translated to UTF-8.
349          */
350         a = archive_write_new();
351         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
352         if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
353                 skipping("This system cannot convert character-set"
354                     " from eucJP to UTF-8.");
355                 archive_write_free(a);
356                 return;
357         }
358         assertEqualInt(ARCHIVE_OK,
359             archive_write_open_memory(a, buff, sizeof(buff), &used));
360
361         entry = archive_entry_new2(a);
362         /* Set an EUC-JP filename. */
363         archive_entry_set_pathname(entry, "\xC9\xBD.txt");
364         /* Check the Unicode version. */
365         archive_entry_set_filetype(entry, AE_IFREG);
366         archive_entry_set_size(entry, 0);
367         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
368         archive_entry_free(entry);
369         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
370
371         /* A bit 11 of general purpose flag should be 0x08,
372          * which indicates the filename charset is UTF-8. */
373         assertEqualInt(0x08, buff[7]);
374         /* Check UTF-8 version. */
375         assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
376
377         /*
378          * Verify that EUC-JP filenames are not translated to UTF-8.
379          */
380         a = archive_write_new();
381         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
382         assertEqualInt(ARCHIVE_OK,
383             archive_write_open_memory(a, buff, sizeof(buff), &used));
384
385         entry = archive_entry_new2(a);
386         /* Set an EUC-JP filename. */
387         archive_entry_set_pathname(entry, "\xC9\xBD.txt");
388         /* Check the Unicode version. */
389         archive_entry_set_filetype(entry, AE_IFREG);
390         archive_entry_set_size(entry, 0);
391         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
392         archive_entry_free(entry);
393         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
394
395         /* A bit 11 of general purpose flag should be 0,
396          * which indicates the filename charset is unknown. */
397         assertEqualInt(0, buff[7]);
398         /* Above three characters in EUC-JP should not translate to
399          * any character-set. */
400         assertEqualMem(buff + 30, "\xC9\xBD.txt", 6);
401
402         /*
403          * Verify that A bit 11 of general purpose flag is not set
404          * when ASCII filenames are stored even if hdrcharset=UTF-8
405          * is specified.
406          */
407         a = archive_write_new();
408         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
409         if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
410                 skipping("This system cannot convert character-set"
411                     " from eucJP to UTF-8.");
412                 archive_write_free(a);
413                 return;
414         }
415         assertEqualInt(ARCHIVE_OK,
416             archive_write_open_memory(a, buff, sizeof(buff), &used));
417
418         entry = archive_entry_new2(a);
419         /* Set an ASCII filename. */
420         archive_entry_set_pathname(entry, "abcABC");
421         /* Check the Unicode version. */
422         archive_entry_set_filetype(entry, AE_IFREG);
423         archive_entry_set_size(entry, 0);
424         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
425         archive_entry_free(entry);
426         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
427
428         /* A bit 11 of general purpose flag should be 0,
429          * which indicates the filename charset is unknown. */
430         assertEqualInt(0, buff[7]);
431         assertEqualMem(buff + 30, "abcABC", 6);
432 }
433
434 static void
435 test_zip_filename_encoding_CP932(void)
436 {
437         struct archive *a;
438         struct archive_entry *entry;
439         char buff[4096];
440         size_t used;
441
442         if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
443             NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
444                 skipping("CP932/SJIS locale not available on this system.");
445                 return;
446         }
447
448         /*
449          * Verify that EUC-JP filenames are correctly translated to UTF-8.
450          */
451         a = archive_write_new();
452         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
453         if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
454                 skipping("This system cannot convert character-set"
455                     " from CP932/SJIS to UTF-8.");
456                 archive_write_free(a);
457                 return;
458         }
459         assertEqualInt(ARCHIVE_OK,
460             archive_write_open_memory(a, buff, sizeof(buff), &used));
461
462         entry = archive_entry_new2(a);
463         /* Set a CP932/SJIS filename. */
464         archive_entry_set_pathname(entry, "\x95\x5C.txt");
465         /* Check the Unicode version. */
466         archive_entry_set_filetype(entry, AE_IFREG);
467         archive_entry_set_size(entry, 0);
468         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
469         archive_entry_free(entry);
470         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
471
472         /* A bit 11 of general purpose flag should be 0x08,
473          * which indicates the filename charset is UTF-8. */
474         assertEqualInt(0x08, buff[7]);
475         /* Check UTF-8 version. */
476         assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
477
478         /*
479          * Verify that CP932/SJIS filenames are not translated to UTF-8.
480          */
481         a = archive_write_new();
482         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
483         assertEqualInt(ARCHIVE_OK,
484             archive_write_open_memory(a, buff, sizeof(buff), &used));
485
486         entry = archive_entry_new2(a);
487         /* Set a CP932/SJIS filename. */
488         archive_entry_set_pathname(entry, "\x95\x5C.txt");
489         /* Check the Unicode version. */
490         archive_entry_set_filetype(entry, AE_IFREG);
491         archive_entry_set_size(entry, 0);
492         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
493         archive_entry_free(entry);
494         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
495
496         /* A bit 11 of general purpose flag should be 0,
497          * which indicates the filename charset is unknown. */
498         assertEqualInt(0, buff[7]);
499         /* Above three characters in CP932/SJIS should not translate to
500          * any character-set. */
501         assertEqualMem(buff + 30, "\x95\x5C.txt", 6);
502
503         /*
504          * Verify that A bit 11 of general purpose flag is not set
505          * when ASCII filenames are stored even if hdrcharset=UTF-8
506          * is specified.
507          */
508         a = archive_write_new();
509         assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
510         if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
511                 skipping("This system cannot convert character-set"
512                     " from CP932/SJIS to UTF-8.");
513                 archive_write_free(a);
514                 return;
515         }
516         assertEqualInt(ARCHIVE_OK,
517             archive_write_open_memory(a, buff, sizeof(buff), &used));
518
519         entry = archive_entry_new2(a);
520         /* Set an ASCII filename. */
521         archive_entry_set_pathname(entry, "abcABC");
522         /* Check the Unicode version. */
523         archive_entry_set_filetype(entry, AE_IFREG);
524         archive_entry_set_size(entry, 0);
525         assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
526         archive_entry_free(entry);
527         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
528
529         /* A bit 11 of general purpose flag should be 0,
530          * which indicates the filename charset is unknown. */
531         assertEqualInt(0, buff[7]);
532         assertEqualMem(buff + 30, "abcABC", 6);
533 }
534
535 DEFINE_TEST(test_zip_filename_encoding)
536 {
537         test_zip_filename_encoding_UTF8();
538         test_zip_filename_encoding_KOI8R();
539         test_zip_filename_encoding_ru_RU_CP1251();
540         test_zip_filename_encoding_Russian_Russia();
541         test_zip_filename_encoding_EUCJP();
542         test_zip_filename_encoding_CP932();
543 }